diff --git a/inc/common/optimizer/graph_optimizer.h b/inc/common/optimizer/graph_optimizer.h
index 2c0cebe6..253aaae1 100644
--- a/inc/common/optimizer/graph_optimizer.h
+++ b/inc/common/optimizer/graph_optimizer.h
@@ -62,6 +62,9 @@ class GraphOptimizer {
   // optimize streamed Graph
   virtual Status OptimizeStreamGraph(ComputeGraph &graph, const RunContext &context) { return SUCCESS; }
+
+  // optimize fused graph after graph slice (op compile)
+  virtual Status OptimizeFusedGraphAfterGraphSlice(ComputeGraph &graph) { return SUCCESS; }
 };
 }  // namespace ge
 /*lint +e148*/
diff --git a/inc/common/util/ai_core/common/aicore_util_attr_define.h b/inc/common/util/ai_core/common/aicore_util_attr_define.h
index 6c20c470..ba28d7b3 100644
--- a/inc/common/util/ai_core/common/aicore_util_attr_define.h
+++ b/inc/common/util/ai_core/common/aicore_util_attr_define.h
@@ -35,5 +35,7 @@ static const std::string ATTR_NAME_L2_FUSION_EXTEND_PTR = "l2_fusion_extend_cont
 static const std::string L1_OPTIMIZED = "l1_optimized";
 
 static const std::string L2_OPTIMIZED = "l2_optimized";
+
+static const std::string OP_SLICE_INFO = "_op_slice_info";
 }  // namespace fe
 #endif
diff --git a/inc/common/util/ai_core/common/scope_allocator.h b/inc/common/util/ai_core/common/scope_allocator.h
index 3b264425..6cebb286 100644
--- a/inc/common/util/ai_core/common/scope_allocator.h
+++ b/inc/common/util/ai_core/common/scope_allocator.h
@@ -34,6 +34,7 @@ class ScopeAllocator {
   bool HasScopeAttr(ge::ConstOpDescPtr opdef);
   bool GetScopeAttr(ge::ConstOpDescPtr opdef, int64_t& scopeId);
   bool SetScopeAttr(ge::OpDescPtr opdef, int64_t scopeId);
+  bool ResetScopeId(int64_t scopeId);
 
  private:
   int64_t scopeId;
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index 619812d7..68743bc8 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -40,6 +40,7 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode";
 const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile";
 const char *const GE_AICPU_FLAG = "ge.aicpuFlag";
 const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath";
+// Dump flags and parameters
 const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump";
 const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath";
 const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep";
@@ -48,7 +49,10 @@ const char *const OPTION_EXEC_ENABLE_DUMP_DEBUG = "ge.exec.enableDumpDebug";
 const char *const OPTION_EXEC_DUMP_DEBUG_MODE = "ge.exec.dumpDebugMode";
 const char *const OPTION_EXEC_ENABLE_INCRE_BUILD = "ge.exec.enableIncreBuild";
 const char *const OPTION_EXEC_INCRE_BUILD_CACHE_PATH = "ge.exec.increBuildCachePath";
+const char *const OPTION_EXEC_ENABLE_EXCEPTION_DUMP = "ge.exec.enable_exception_dump";
 const char *const OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES = "ge.exec.enableScopeFusionPasses";
+const char *const OPTION_EXEC_PROFILING_FPPONIT_OPTIONS = "ge.exec.profilingFpPointOptions";
+const char *const OPTION_EXEC_PROFILING_BPPONIT_OPTIONS = "ge.exec.profilingBpPointOptions";
 // profiling flag
 const char *const OPTION_EXEC_PROFILING_MODE = "ge.exec.profilingMode";
 const char *const OPTION_EXEC_PROFILING_OPTIONS = "ge.exec.profilingOptions";
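The new exception-dump switch and the FP/BP profiling-point options are plain string options like the existing ones. A minimal sketch of wiring them up (the option keys are the constants above; the init call and node-name values are illustrative, not part of this patch):

    std::map<std::string, std::string> options;
    options["ge.exec.enable_exception_dump"] = "1";          // dump op inputs/outputs when an exception occurs
    options["ge.exec.profilingFpPointOptions"] = "fp_node";  // forward-propagation point (hypothetical node name)
    options["ge.exec.profilingBpPointOptions"] = "bp_node";  // backward-propagation point (hypothetical node name)
    // e.g. passed to ge::GEInitialize(options) or a Session, depending on deployment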
diff --git a/inc/external/graph/operator_reg.h b/inc/external/graph/operator_reg.h
index f0e1e84a..759c70f2 100644
--- a/inc/external/graph/operator_reg.h
+++ b/inc/external/graph/operator_reg.h
@@ -223,6 +223,7 @@ class OpReg {
                                                 \
  private:                                       \
   void __dy_input_##x() {                       \
+    Operator::DynamicInputRegister(#x, 0, true); \
     (void)OpReg()

 #define DYNAMIC_OUTPUT(x, t) \
@@ -242,6 +243,7 @@ class OpReg {
                                                 \
  private:                                       \
   void __dy_output_##x() {                      \
+    Operator::DynamicOutputRegister(#x, 0, true); \
     (void)OpReg()

 #define GRAPH(x) \
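With this change a dynamic port is registered (with zero instances) as soon as the operator object is constructed, instead of waiting for an explicit create_dynamic_input/output call. For reference, a declaration that exercises the macro might look like this (the op name and dtypes are illustrative):

    REG_OP(MyConcat)
        .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32}))  // expands __dy_input_x(), now pre-registering "x"
        .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
        .ATTR(N, Int, 1)
        .OP_END_FACTORY_REG(MyConcat)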
diff --git a/inc/external/register/register.h b/inc/external/register/register.h
index e905e8d4..f3091fae 100644
--- a/inc/external/register/register.h
+++ b/inc/external/register/register.h
@@ -55,6 +55,28 @@ class Message;
 }  // namespace google

 namespace domi {
+const int64_t kMaxNameLength = 1048576;  // 1M
+
+enum DynamicType { kInvalid = 0, kInput = 1, kOutput = 2 };
+struct DynamicInputOutputInfo {
+  DynamicType type;  // input/output
+  const char *port_name;
+  int64_t port_name_len;
+  const char *attr_name;
+  int64_t attr_name_len;
+  DynamicInputOutputInfo()
+      : type(kInvalid), port_name(nullptr), port_name_len(0), attr_name(nullptr), attr_name_len(0) {}
+  DynamicInputOutputInfo(DynamicType type, const char *port_name, int64_t port_name_len, const char *attr_name,
+                         int64_t attr_name_len)
+      : type(type),
+        port_name(port_name),
+        port_name_len(port_name_len),
+        attr_name(attr_name),
+        attr_name_len(attr_name_len) {}
+};
+Status AutoMappingByOpFn(const ge::Operator &op_src, ge::Operator &op);
+Status AutoMappingByOpFnDynamic(const ge::Operator &op_src, ge::Operator &op,
+                                const vector<DynamicInputOutputInfo> &dynamic_name_attr_value);
 Status AutoMappingFn(const google::protobuf::Message *op_src, ge::Operator &op);
 Status AutoMappingFnDynamic(const google::protobuf::Message *op_src, ge::Operator &op,
                             std::map<std::string, std::pair<std::string, std::string>> dynamic_name_attr_value,
@@ -71,6 +93,7 @@ using ParseParamFunc = std::function<domi::Status(const google::protobuf::Messa
 using FusionParseParamFunc =
     std::function<domi::Status(const google::protobuf::Message *, std::vector<ge::Operator> &, ge::Operator &)>;
+using FusionParseParamByOpFunc = std::function<domi::Status(const std::vector<ge::Operator> &, ge::Operator &)>;
 using ParseSubgraphFunc = std::function<Status(const std::string &subgraph_name, const ge::Graph &graph)>;

 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {
@@ -91,6 +114,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {

   OpRegistrationData &FusionParseParamsFn(const FusionParseParamFunc &fusionParseParamFn);

+  OpRegistrationData &FusionParseParamsFn(const FusionParseParamByOpFunc &fusion_parse_param_fn);
+
   OpRegistrationData &ParseSubgraphPostFn(const ParseSubgraphFunc &subgraph_post_fn);

   OpRegistrationData &ImplyType(const domi::ImplyType &imply_type);
@@ -108,6 +133,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData {
   ParseParamFunc GetParseParamFn() const;
   ParseParamByOpFunc GetParseParamByOperatorFn() const;
   FusionParseParamFunc GetFusionParseParamFn() const;
+  FusionParseParamByOpFunc GetFusionParseParamByOpFn() const;
   ParseSubgraphFunc GetParseSubgraphPostFn() const;

  private:
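AutoMappingByOpFnDynamic() is the operator-based counterpart of AutoMappingFnDynamic(): it maps attributes straight from a source ge::Operator instead of a protobuf message, with the dynamic ports described by DynamicInputOutputInfo. A minimal sketch of a parser callback using it (op, port, and attr names are illustrative):

    #include <cstring>
    domi::Status ParseParamsMyConcat(const ge::Operator &op_src, ge::Operator &op) {
      // dynamic input port "x" whose instance count comes from attr "N"
      std::vector<domi::DynamicInputOutputInfo> dynamic_info;
      dynamic_info.emplace_back(domi::kInput, "x", strlen("x"), "N", strlen("N"));
      return domi::AutoMappingByOpFnDynamic(op_src, op, dynamic_info);
    }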
diff --git a/inc/external/register/scope/scope_fusion_pass_register.h b/inc/external/register/scope/scope_fusion_pass_register.h
index 77be4b8c..8e5605a7 100644
--- a/inc/external/register/scope/scope_fusion_pass_register.h
+++ b/inc/external/register/scope/scope_fusion_pass_register.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <unordered_map>
 #include "ge/ge_api_error_codes.h"
 #include "register/register_error_codes.h"
 #include "register/register_types.h"
@@ -52,15 +53,16 @@ class ScopePassManager;

 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY Scope {
  public:
-  explicit Scope(const std::string &name, const std::string &sub_type = "", Scope *father_scope = nullptr);
+  Scope();
+  Status Init(const std::string &name, const std::string &sub_type = "", Scope *father_scope = nullptr);
   ~Scope();

-  std::string Name() const;
-  std::string SubType() const;
-  std::map<std::string, ge::OperatorPtr> AllNodesMap() const;
+  const std::string &Name() const;
+  const std::string &SubType() const;
+  const std::unordered_map<std::string, ge::OperatorPtr> &AllNodesMap() const;
   Scope *GetSubScope(const std::string &scope_name) const;
-  std::string LastName() const;
-  std::vector<Scope *> GetAllSubScopes() const;
+  const std::string LastName() const;
+  const std::vector<Scope *> &GetAllSubScopes() const;
   const Scope *GetFatherScope() const;

  private:
@@ -76,12 +78,13 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY Scope {
 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY FusionScopesResult {
  public:
   FusionScopesResult();
+  Status Init();
   ~FusionScopesResult();
   void SetName(const std::string &name);
   void SetType(const std::string &type);
   void SetDescription(const std::string &description);
-  std::string Name() const;
-  std::vector<ge::OperatorPtr> Nodes() const;
+  const std::string &Name() const;
+  const std::vector<ge::OperatorPtr> &Nodes() const;
   void InsertInputs(const std::string &inner_op_name, const std::vector<int32_t> &index_map);
   void InsertOutputs(const std::string &inner_op_name, const std::vector<int32_t> &index_map);
@@ -136,7 +139,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY ScopeTree {
   ScopeTree &operator=(const ScopeTree &scopetree) = delete;
   ~ScopeTree();

-  std::vector<Scope *> GetAllScopes() const;
+  const std::vector<Scope *> &GetAllScopes() const;

  private:
  class ScopeTreeImpl;
@@ -154,7 +157,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY ScopeGraph {
   ~ScopeGraph();

   const ScopeTree *GetScopeTree() const;
-  std::map<std::string, ge::NodePtr> GetNodesMap() const;
+  const std::unordered_map<std::string, ge::NodePtr> &GetNodesMap() const;

  private:
  class ScopeGraphImpl;
@@ -203,7 +206,7 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY NodeOpTypeFeature : ScopeBa
 class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY NodeAttrFeature : ScopeBaseFeature {
  public:
-  NodeAttrFeature(std::string nodeType, std::string attr_name, ge::DataType datatype, ScopeAttrValue attr_value);
+  NodeAttrFeature(std::string nodeType, std::string attr_name, ge::DataType datatype, ScopeAttrValue &attr_value);
   NodeAttrFeature(NodeAttrFeature const &feature);
   NodeAttrFeature &operator=(NodeAttrFeature const &feature);
   ~NodeAttrFeature();
diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h
index 3686befc..36c1a0bf 100644
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -258,16 +258,19 @@ struct ComputeGraphDescInfo {
 struct OpDescInfo {
   std::string op_name;
+  std::string op_type;
   uint32_t task_id;
   uint32_t stream_id;
   std::vector<Format> input_format;
   std::vector<std::vector<int64_t>> input_shape;
   std::vector<DataType> input_data_type;
   std::vector<void *> input_addrs;
+  std::vector<int64_t> input_size;
   std::vector<Format> output_format;
   std::vector<std::vector<int64_t>> output_shape;
   std::vector<DataType> output_data_type;
   std::vector<void *> output_addrs;
+  std::vector<int64_t> output_size;
 };
 struct ModelDumpConfig {
   std::string model_name;
diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h
index 3671f970..fbe7e73f 100644
--- a/inc/framework/common/helper/model_helper.h
+++ b/inc/framework/common/helper/model_helper.h
@@ -64,6 +64,7 @@ class ModelHelper {
   Status LoadWeights(OmFileLoadHelper& om_load_helper);
   Status LoadTask(OmFileLoadHelper& om_load_helper);
   Status LoadTBEKernelStore(OmFileLoadHelper& om_load_helper);
+  Status LoadCustAICPUKernelStore(OmFileLoadHelper& om_load_helper);
   Status ReleaseLocalModelData() noexcept;
   Status SaveModelPartition(std::shared_ptr<OmFileSaveHelper>& om_file_save_helper, ModelPartitionType type,
                             const uint8_t* data, size_t size);
diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h
index 189c63c3..ad284d07 100644
--- a/inc/framework/common/types.h
+++ b/inc/framework/common/types.h
@@ -851,9 +851,9 @@ static constexpr int32_t PARTITION_TYPE_WEIGHTS = 1;
 static constexpr int32_t PARTITION_TYPE_TASK_INFO = 2;

 // number of partitions in the current model
-static constexpr uint32_t PARTITION_SIZE = 4;
+static constexpr uint32_t PARTITION_SIZE = 5;

-enum ModelPartitionType { MODEL_DEF = 0, WEIGHTS_DATA, TASK_INFO, TBE_KERNELS };
+enum ModelPartitionType { MODEL_DEF = 0, WEIGHTS_DATA, TASK_INFO, TBE_KERNELS, CUST_AICPU_KERNELS };

 struct ModelPartitionMemInfo {
   ModelPartitionType type;
diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h
index f9fa4ce9..00846112 100644
--- a/inc/framework/executor/ge_executor.h
+++ b/inc/framework/executor/ge_executor.h
@@ -108,11 +108,11 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor {
   /// @ingroup ge
   /// @brief Get current dynamic dims info by combined dims
   /// @param [in] model_id: model id allocate from manager
-  /// @param [in] combined_dims: array of combined dimensions
+  /// @param [in] dynamic_dims: dynamic dims value of the current gear
   /// @param [out] cur_dynamic_dims: current dynamic dims
   /// @return execute result
   ///
-  ge::Status GetCurDynamicDims(uint32_t model_id, const std::vector<uint64_t> &combined_dims,
+  ge::Status GetCurDynamicDims(uint32_t model_id, const std::vector<uint64_t> &dynamic_dims,
                                std::vector<uint64_t> &cur_dynamic_dims);

   ///
diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h
index d3f472e9..37bca897 100644
--- a/inc/framework/generator/ge_generator.h
+++ b/inc/framework/generator/ge_generator.h
@@ -28,6 +28,7 @@
 #include "graph/graph.h"
 #include "graph/op_desc.h"
 #include "graph/detail/attributes_holder.h"
+#include "omg/omg_inner_types.h"

 namespace ge {
 class GeGenerator {
@@ -45,6 +46,7 @@ class GeGenerator {
   GeGenerator &operator=(const GeGenerator &) = delete;

   Status Initialize(const std::map<std::string, std::string> &options);
+  Status Initialize(const std::map<std::string, std::string> &options, OmgContext &context);

   Status Finalize();
diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h
index 80361232..2f91d7aa 100644
--- a/inc/framework/omg/omg_inner_types.h
+++ b/inc/framework/omg/omg_inner_types.h
@@ -98,24 +98,14 @@ struct OmgContext {
   std::vector<std::string> out_top_names;
   // path for the aicpu custom operator so_file
   std::vector<std::string> aicpu_op_run_paths;
-  // ddk version
-  std::string ddk_version;
   // preferential format used by the entire network
   domiTensorFormat_t net_format = DOMI_TENSOR_RESERVED;
   domi::FrameworkType type = domi::FRAMEWORK_RESERVED;
   RunMode run_mode = ONLY_PRE_CHECK;
   bool train_flag = false;
-  // whether to use FP16 high precision
-  int32_t fp16_high_precision = HIGH_PRECISION_DEFAULT;
   std::string output_type;
-  // Save the name of the entire network: Some special operators are used to determine a network. Some operators in the
-  // network require special processing based on the specific network. e.g.: for faster-rcnn, the FirstStageProcessor
-  // module is determined as the Faster-R-CNN network based on the scope fusion. Then, the conv+reshape operators in the
-  // FirstStageBoxPredictor/BoxEncodingPredictor scope are combined. The convolution kernel rearrangement reshape
-  // operator needs to be deleted for the convolution kernel.
-  std::string net_name;
   // Whether to use dynamic batch size or dynamic image size
   bool is_dynamic_input = false;
   std::string dynamic_batch_size;
diff --git a/inc/graph/compute_graph.h b/inc/graph/compute_graph.h
index 8d3db43c..2ec6b663 100644
--- a/inc/graph/compute_graph.h
+++ b/inc/graph/compute_graph.h
@@ -93,6 +93,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   NodePtr AddNodeFront(const OpDescPtr &op);
   NodePtr AddInputNode(NodePtr node);
   NodePtr AddOutputNode(NodePtr node);
+  NodePtr AddOutputNodeByIndex(NodePtr node, int32_t index);
   // insert node with specific pre_node
   NodePtr AddNodeAfter(OpDescPtr &op, const NodePtr &pre_node);
   NodePtr AddNodeAfter(NodePtr node, const NodePtr &pre_node);
@@ -138,6 +139,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   graphStatus TopologicalSorting();
   bool IsValid() const;
+  void InValid() { is_valid_flag_ = false; }
   void Dump() const;

   void Swap(ComputeGraph &graph);
@@ -268,6 +270,7 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
   friend class ModelSerializeImp;
   friend class GraphDebugImp;
   friend class OnnxUtils;
+  friend class TuningUtils;

   std::string name_;
   uint32_t graph_id_ = 0;
diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h
index 714375e4..b0bf8ce9 100644
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@@ -1031,6 +1031,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_FLAG;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_ADDR;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_INPUT_L1_VALID_SIZE;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ENGINE_NAME_FOR_LX;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NEED_LX_FUSION;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OPTIMIZE_GROUP;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OP_COMPILE_STRATEGY;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_NAME;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_BUFFER;

 // for unregistered op
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_UNREGST_OPPATH;
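These attributes carry per-op lx-fusion hints through the graph. They are ordinary string attributes, so a fusion pass can tag an op and the compile side can read the hint back; a minimal sketch (the strategy payload and engine name are illustrative):

    // assuming an OpDescPtr op_desc obtained from a node
    (void)ge::AttrUtils::SetStr(op_desc, ge::ATTR_NAME_OP_COMPILE_STRATEGY, "my_strategy");  // illustrative payload
    (void)ge::AttrUtils::SetStr(op_desc, ge::ATTR_NAME_ENGINE_NAME_FOR_LX, "AIcoreEngine");
    std::string strategy;
    if (ge::AttrUtils::GetStr(op_desc, ge::ATTR_NAME_OP_COMPILE_STRATEGY, strategy)) {
      // the op compiler picks the hint back up here
    }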
diff --git a/inc/graph/node.h b/inc/graph/node.h
index 2629f525..f4a1c6a8 100644
--- a/inc/graph/node.h
+++ b/inc/graph/node.h
@@ -174,6 +174,9 @@ class Node : public std::enable_shared_from_this<Node> {
     fusion_output_dataflow_list_ = fusion_output_list;
   }

+  bool GetHostNode() const { return host_node_; }
+  void SetHostNode(bool is_host) { host_node_ = is_host; }
+
   void SetOrigNode(const NodePtr &orignode) { orig_node_ = orignode; }
   NodePtr GetOrigNode() { return orig_node_; }

@@ -192,6 +195,7 @@ class Node : public std::enable_shared_from_this<Node> {
   OutControlAnchorPtr out_control_anchor_;
   map<string, GeAttrValue> attrs_;  // lint !e1073
   bool has_init_{false};
+  bool host_node_{false};
   bool anchor_status_updated_{false};
   std::vector<uint32_t> send_event_id_list_;
   std::vector<uint32_t> recv_event_id_list_;
@@ -202,6 +206,7 @@ class Node : public std::enable_shared_from_this<Node> {
   NodePtr orig_node_;
   friend class NodeUtils;
   friend class OnnxUtils;
+  friend class TuningUtils;
 };
 }  // namespace ge
diff --git a/inc/graph/op_desc.h b/inc/graph/op_desc.h
index 27c91efc..c7da30b7 100644
--- a/inc/graph/op_desc.h
+++ b/inc/graph/op_desc.h
@@ -18,6 +18,7 @@
 #define INC_GRAPH_OP_DESC_H_

 #include <functional>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
@@ -87,6 +88,8 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {

   graphStatus AddInputDescMiddle(const string &name, const unsigned int num, size_t index);

+  graphStatus AddOutputDescMiddle(const string &name, const unsigned int num, size_t index);
+
   graphStatus AddOutputDescForward(const string &name, const unsigned int num);

   graphStatus AddOptionalInputDesc(const string &name, const GeTensorDesc &input_desc);
@@ -187,6 +190,14 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {

   graphStatus CommonVerify() const;

+  graphStatus AddRegisterInputName(const string &name);
+
+  graphStatus AddRegisterOutputName(const string &name);
+
+  vector<string> GetRegisterInputName() const;
+
+  vector<string> GetRegisterOutputName() const;
+
   using AttrHolder::AddRequiredAttr;
   using AttrHolder::DelAttr;
   using AttrHolder::GetAllAttrNames;
@@ -297,9 +308,11 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {
   vector<GeTensorDescPtr> inputs_desc_{};
   map<string, uint32_t> input_name_idx_{};
+  vector<string> register_input_name_{};
   std::unordered_set<string> optional_input_names_{};
   vector<GeTensorDescPtr> outputs_desc_{};
   map<string, uint32_t> output_name_idx_{};
+  vector<string> register_output_name_{};
   std::function<graphStatus(Operator &)> infer_func_ = nullptr;
   std::function<graphStatus(Operator &)> infer_format_func_ = nullptr;
   std::function<graphStatus(Operator &)> verifier_func_ = nullptr;
diff --git a/inc/graph/op_kernel_bin.h b/inc/graph/op_kernel_bin.h
index e81d79d0..3970460a 100644
--- a/inc/graph/op_kernel_bin.h
+++ b/inc/graph/op_kernel_bin.h
@@ -42,6 +42,7 @@ class OpKernelBin {

 using OpKernelBinPtr = std::shared_ptr<OpKernelBin>;
 const char *const OP_EXTATTR_NAME_TBE_KERNEL = "tbeKernel";
+const char *const OP_EXTATTR_CUSTAICPU_KERNEL = "cust_aicpu_kernel";
 }  // namespace ge

 #endif  // INC_GRAPH_OP_KERNEL_BIN_H_
diff --git a/inc/graph/opsproto_manager.h b/inc/graph/opsproto_manager.h
index 46b722ec..06846573 100644
--- a/inc/graph/opsproto_manager.h
+++ b/inc/graph/opsproto_manager.h
@@ -23,6 +23,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <mutex>

 namespace ge {
 class OpsProtoManager {
@@ -30,14 +31,15 @@ class OpsProtoManager {
   static OpsProtoManager *Instance();

   bool Initialize(const std::map<std::string, std::string> &options);
-
   void Finalize();
+
+ private:
   void LoadOpsProtoPluginSo(std::string &path);

- private:
   std::string pluginPath_;
   std::vector<void *> handles_;
+  bool is_init_ = false;
+  std::mutex mutex_;
 };
 }  // namespace ge
diff --git a/inc/graph/tuning_utils.h b/inc/graph/tuning_utils.h
new file mode 100644
index 00000000..98262a23
--- /dev/null
+++ b/inc/graph/tuning_utils.h
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MAIN_TUNING_UTILS_H
+#define MAIN_TUNING_UTILS_H
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <queue>
+#include <string>
+
+#include <graph/anchor.h>
+#include <graph/detail/attributes_holder.h>
+#include <graph/ge_attr_value.h>
+#include <graph/ge_tensor.h>
+#include <graph/graph.h>
+#include <graph/model.h>
+#include <graph/node.h>
+#include <graph/utils/graph_utils.h>
+
+#include "framework/common/debug/ge_log.h"
+#include "utils/attr_utils.h"
+#include "utils/node_utils.h"
+#include "external/ge/ge_api_types.h"
+#include "graph/debug/ge_attr_define.h"
+#include "graph/utils/op_desc_utils.h"
+#include "graph/utils/tensor_utils.h"
+namespace ge {
+// Configure build mode, default value is "normal"
+const char *const BUILD_MODE = "ge.buildMode";
+const char *const BUILD_STEP = "ge.buildStep";
+// Configure tuning path
+const char *const TUNING_PATH = "ge.tuningPath";
+// for interface: aclgrphBuildModel
+const std::set<std::string> ir_builder_supported_options_for_lx_fusion = {BUILD_MODE, BUILD_STEP, TUNING_PATH};
+
+// Build model
+const char *const BUILD_MODE_NORMAL = "normal";
+const char *const BUILD_MODE_TUNING = "tuning";
+const char *const BUILD_MODE_BASELINE = "baseline";
+const std::set<std::string> build_mode_options = {BUILD_MODE_NORMAL, BUILD_MODE_TUNING, BUILD_MODE_BASELINE};
+
+// Build step
+const char *const BUILD_STEP_BEFORE_UB_MATCH = "before_ub_match";
+const char *const BUILD_STEP_AFTER_UB_MATCH = "after_ub_match";
+const char *const BUILD_STEP_AFTER_BUILDER = "after_builder";
+const char *const BUILD_STEP_AFTER_BUILDER_SUB = "after_builder_sub";
+const char *const BUILD_STEP_AFTER_MERGE = "after_merge";
+const std::set<std::string> build_step_options = {BUILD_STEP_BEFORE_UB_MATCH, BUILD_STEP_AFTER_UB_MATCH,
+                                                  BUILD_STEP_AFTER_BUILDER, BUILD_STEP_AFTER_BUILDER_SUB,
+                                                  BUILD_STEP_AFTER_MERGE};
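+// A typical tuning build passes these keys in the options map of the IR build
+// interface named above (a sketch; the path value is illustrative):
+//   std::map<std::string, std::string> build_options = {{BUILD_MODE, BUILD_MODE_TUNING},
+//                                                       {BUILD_STEP, BUILD_STEP_BEFORE_UB_MATCH},
+//                                                       {TUNING_PATH, "/tmp/tuning_graphs"}};
+//   e.g. aclgrphBuildModel(graph, build_options, model);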
+using SubgraphCreateOutNode = std::unordered_map<ComputeGraphPtr, NodePtr>;
+using NodetoNodeMap = std::unordered_map<NodePtr, NodePtr>;
+using NodeSet = std::set<NodePtr>;
+using NodeNametoNodeNameMap = std::unordered_map<std::string, std::string>;
+using NodetoNodeNameMap = std::unordered_map<NodePtr, std::string>;
+class TuningUtils {
+ public:
+  TuningUtils() = default;
+  ~TuningUtils() = default;
+  // Dump all the subgraphs and modify
+  // the subgraphs in them to be executable subgraphs if exe_flag is true
+  // `tuning_path` means path to save the graphs
+  static graphStatus ConvertGraphToFile(std::vector<ComputeGraphPtr> tuning_subgraphs,
+                                        std::vector<ComputeGraphPtr> non_tuning_subgraphs = {}, bool exe_flag = false,
+                                        const std::string &path = "", const std::string &user_path = "");
+  // Recover `graph` from the graph dump files configured in options
+  static graphStatus ConvertFileToGraph(const map<int64_t, string> &options, ge::Graph &graph);
+
+ private:
+  // part 1
+  struct HelpInfo {
+    int64_t index;
+    bool exe_flag;
+    bool is_tuning_graph;
+    const std::string &path;
+    const std::string &user_path;
+  };
+  static graphStatus MakeExeGraph(ComputeGraphPtr &exe_graph, const HelpInfo &help_info);
+  static graphStatus HandlePld(NodePtr &node);
+  static graphStatus HandleEnd(NodePtr &node);
+  static graphStatus ChangePld2Data(NodePtr &node, NodePtr &data_node);
+  static graphStatus ChangeEnd2NetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus LinkEnd2NetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus CreateDataNode(NodePtr &node, NodePtr &data_node);
+  static graphStatus CreateNetOutput(NodePtr &node, NodePtr &out_node);
+  static graphStatus AddAttrToDataNodeForMergeGraph(const NodePtr &pld, NodePtr &data_node);
+  static graphStatus AddAttrToNetOutputForMergeGraph(const NodePtr &end, NodePtr &out_node);
+  static void DumpGraphToPath(ComputeGraphPtr &exe_graph, int64_t index, bool is_tuning_graph, std::string path);
+
+  static SubgraphCreateOutNode create_output_;
+  // part 2
+  static graphStatus MergeAllSubGraph(std::vector<ComputeGraphPtr> &graphs, ComputeGraphPtr &graph);
+  static graphStatus MergeSubGraph(ComputeGraphPtr &graph);
+  // Deletes the Data and NetOutput nodes that were added by the MakeExeGraph() call in part 1
+  static graphStatus RemoveDataNetoutputEdge(ComputeGraphPtr &graph);
+  static graphStatus GetInAndOutAnchorPair(NodePtr &data_node, NodePtr &out_node, AnchorPtr &dest_in_anchor,
+                                           AnchorPtr &src_out_anchor);
+  static NodeNametoNodeNameMap data_2_netoutput_;
+  static NodetoNodeNameMap data_node_2_netoutput_;
+  static NodetoNodeMap data_node_2_netoutput_node_;
+  static NodeSet netoutput_nodes_;
+  static NodeSet merged_graph_nodes_;
+  static std::mutex mutex_;
+  // for debug
+  static std::string PrintCheckLog();
+  static std::string GetNodeNameByAnchor(const Anchor *anchor);
+};
+}  // namespace ge
+#endif  // MAIN_TUNING_UTILS_H
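Taken together, part 1 dumps each partitioned subgraph as an executable graph file and part 2 merges the dumps back into one graph. A round trip might look like this (a sketch; the paths are illustrative, and the options map keyed by subgraph index is an assumption about ConvertFileToGraph's expected layout):

    // dump side: after partitioning, persist the subgraphs for offline tuning
    std::vector<ge::ComputeGraphPtr> tuning_graphs = {sub_graph_0, sub_graph_1};  // from the partitioner
    (void)ge::TuningUtils::ConvertGraphToFile(tuning_graphs, {}, true, "/tmp/tuning");
    // recover side: <index, file path> entries pointing at the dumped files
    std::map<int64_t, std::string> files = {{0, "/tmp/tuning/aicore_subgraph_0.txt"},
                                            {1, "/tmp/tuning/aicore_subgraph_1.txt"}};
    ge::Graph merged("merged");
    (void)ge::TuningUtils::ConvertFileToGraph(files, merged);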
diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h
index 5f627ea4..fdcbe1a9 100644
--- a/inc/graph/utils/graph_utils.h
+++ b/inc/graph/utils/graph_utils.h
@@ -36,8 +36,8 @@
   do { \
     GraphUtils::DumpGEGraph(compute_graph, name); \
     GraphUtils::DumpGEGraphToOnnx(*compute_graph, name); \
+    uint64_t i = 0; \
     for (const auto &sub_graph_func : compute_graph->GetAllSubgraphs()) { \
-      static int8_t i = 0; \
       auto sub_graph_func_name = std::string(name) + std::string("_sub_graph_") + std::to_string(i++); \
       GraphUtils::DumpGEGraph(sub_graph_func, sub_graph_func_name); \
       GraphUtils::DumpGEGraphToOnnx(*sub_graph_func, sub_graph_func_name); \
@@ -203,10 +203,13 @@ class GraphUtils {

   static bool MatchDumpStr(const std::string &suffix);

-  static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false);
+  static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false,
+                          const std::string &user_graph_name = "");

   static bool LoadGEGraph(const char *file, ge::ComputeGraph &compute_graph);

+  static bool LoadGEGraph(const char *file, ge::ComputeGraphPtr &compute_graph);
+
   static void BreakConnect(const std::map<OperatorImplPtr, NodePtr> &all_nodes_infos);

   static void DumpGEGraphToOnnx(const ge::ComputeGraph &compute_graph, const std::string &suffix);
diff --git a/src/common/graph/CMakeLists.txt b/src/common/graph/CMakeLists.txt
index f041e4b6..4f9e1a00 100755
--- a/src/common/graph/CMakeLists.txt
+++ b/src/common/graph/CMakeLists.txt
@@ -24,6 +24,7 @@ file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
         "../../proto/task.proto"
         "../../proto/fwk_adaper.proto"
         "../../proto/op_mapping_info.proto"
+        "../proto/dump_task.proto"
         )

 file(GLOB_RECURSE ONNX_PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
diff --git a/src/common/graph/compute_graph.cc b/src/common/graph/compute_graph.cc
index 52953fb2..e6c306b0 100644
--- a/src/common/graph/compute_graph.cc
+++ b/src/common/graph/compute_graph.cc
@@ -36,6 +36,7 @@ namespace ge {
 namespace {
 const size_t OUTPUT_PARAM_SIZE = 2;
+const std::string alias_name_attr = "_aliasName";
 bool IsUseBFS() {
   string run_mode;
   const int base = 10;
@@ -133,6 +134,14 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodePtr ComputeGraph::FindNode(co
     if (node->GetName() == name) {
       return node;
     }
+    std::vector<string> out_alias_name;
+    if (AttrUtils::GetListStr(node->GetOpDesc(), alias_name_attr, out_alias_name)) {
+      for (const auto &alias_name : out_alias_name) {
+        if (alias_name == name) {
+          return node;
+        }
+      }
+    }
   }
   return nullptr;
 }
@@ -258,6 +267,7 @@ NodePtr ComputeGraph::AddNodeFront(NodePtr node) {
     GELOGE(GRAPH_FAILED, "The node ptr or op desc should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId(nodes_.size());
   if (nodes_.size() > 0 && nodes_[0]->GetType() == DATA) {
     (void)nodes_.insert(nodes_.begin() + 1, node);
@@ -284,6 +294,7 @@ NodePtr ComputeGraph::AddNodeAfter(NodePtr node, const NodePtr &pre_node) {
     GELOGE(GRAPH_FAILED, "The node ptr or op desc should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId(nodes_.size());
   auto node_iter = std::find(nodes_.begin(), nodes_.end(), pre_node);
   if (node_iter != nodes_.end()) {
@@ -313,6 +324,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodePtr ComputeGraph::AddNode(Nod
     GELOGE(GRAPH_FAILED, "The node ptr should not be null.");
     return nullptr;
   }
+  node->SetHostNode(is_valid_flag_);
   node->GetOpDesc()->SetId((int64_t)GetDirectNodesSize());
   nodes_.push_back(node);
   return node;
@@ -339,6 +351,7 @@ NodePtr ComputeGraph::AddNode(OpDescPtr op, int64_t id) {  // for unserialize.
   NodePtr node = shared_ptr<Node>(new (std::nothrow) Node(op, shared_from_this()));
   GE_IF_BOOL_EXEC(node == nullptr, GELOGE(GRAPH_FAILED, "node_ptr is NULL!!!"); return nullptr);
   GE_IF_BOOL_EXEC(node->Init() != GRAPH_SUCCESS, GELOGE(GRAPH_FAILED, "node init fail."); return nullptr);
+  node->SetHostNode(is_valid_flag_);
   nodes_.push_back(node);
   return node;
 }
@@ -355,7 +368,9 @@ NodePtr ComputeGraph::AddInputNode(NodePtr node) {
   return node;
 }

-NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
+NodePtr ComputeGraph::AddOutputNode(NodePtr node) { return AddOutputNodeByIndex(node, 0); }
+
+NodePtr ComputeGraph::AddOutputNodeByIndex(NodePtr node, int32_t index) {
   if (node == nullptr || node->GetOpDesc() == nullptr) {
     GELOGE(GRAPH_FAILED, "The node ptr or opdesc should not be null.");
     return nullptr;
@@ -365,7 +380,7 @@ NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
   NodePtr result = node;
   // [output_nodes_info_ : should not be null]
   for (const auto &item : output_nodes_info_) {
-    if (item.first->GetName() == node->GetName()) {
+    if (item.first->GetName() == node->GetName() && item.second == index) {
       already_have = true;
       result = item.first;
       break;
@@ -373,7 +388,8 @@ NodePtr ComputeGraph::AddOutputNode(NodePtr node) {
   }

   if (!already_have) {
-    output_nodes_info_.emplace_back(std::make_pair(node, 0));
+    output_nodes_info_.emplace_back(std::make_pair(node, index));
+    GELOGI("Push back node name:%s, index:%d, into output_nodes_info_.", node->GetName().c_str(), index);
   }

   if (std::find(nodes_.begin(), nodes_.end(), node) == nodes_.end()) {
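FindNode() now falls back to the node's "_aliasName" string-list attribute, so a node renamed during optimization can still be looked up under its original name. A minimal sketch (the attribute key is the constant above; node names are illustrative):

    // a pass renames conv1 but records the old name as an alias
    (void)ge::AttrUtils::SetListStr(node->GetOpDesc(), "_aliasName", {"conv1"});
    // later lookups by the original name still resolve to the renamed node
    ge::NodePtr found = compute_graph->FindNode("conv1");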
diff --git a/src/common/graph/debug/ge_op_types.h b/src/common/graph/debug/ge_op_types.h
index f11ef31e..dff87331 100644
--- a/src/common/graph/debug/ge_op_types.h
+++ b/src/common/graph/debug/ge_op_types.h
@@ -32,6 +32,8 @@ GE_REGISTER_OPTYPE(STATELESSWHILE, "StatelessWhile");
 GE_REGISTER_OPTYPE(SQUEEZE, "Squeeze");
 GE_REGISTER_OPTYPE(EXPANDDIMS, "ExpandDims");
 GE_REGISTER_OPTYPE(SWITCH, "Switch");
+GE_REGISTER_OPTYPE(REFSWITCH, "RefSwitch");
+GE_REGISTER_OPTYPE(SWITCHN, "SwitchN");
 GE_REGISTER_OPTYPE(MERGE, "Merge");
 GE_REGISTER_OPTYPE(STREAMMERGE, "StreamMerge");
 GE_REGISTER_OPTYPE(ENTER, "Enter");
@@ -40,6 +42,7 @@ GE_REGISTER_OPTYPE(NEXTITERATION, "NextIteration");
 GE_REGISTER_OPTYPE(REFNEXTITERATION, "RefNextIteration");
 GE_REGISTER_OPTYPE(CONSTANT, "Const");
 GE_REGISTER_OPTYPE(PLACEHOLDER, "PlaceHolder");
+GE_REGISTER_OPTYPE(END, "End");
 GE_REGISTER_OPTYPE(FRAMEWORKOP, "FrameworkOp");
 GE_REGISTER_OPTYPE(GETNEXT, "GetNext");
 GE_REGISTER_OPTYPE(INITDATA, "InitData");
diff --git a/src/common/graph/format_refiner.cc b/src/common/graph/format_refiner.cc
index 9cb76539..4cb41349 100644
--- a/src/common/graph/format_refiner.cc
+++ b/src/common/graph/format_refiner.cc
@@ -43,7 +43,7 @@ namespace ge {
 namespace {
 const std::unordered_set<string> kChangeDimNodes = {PERMUTE, EXPANDDIMS, SQUEEZE};
 const string kIsGraphInferred = "_is_graph_inferred";
-RefRelations reflection_builder;
+thread_local RefRelations reflection_builder;
 }  // namespace

 graphStatus ReflectionProcess(const std::unordered_set<RefCell, RefCellHash> &reflection,
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index fde03a43..708347a7 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -967,6 +967,13 @@ const std::string ATTR_NAME_SWITCH_FOR_L2_FUSION = "_enable_l2_fusion";
 const std::string ATTR_NAME_OP_INPUT_L1_FLAG = "_op_input_l1_flag";
 const std::string ATTR_NAME_OP_INPUT_L1_ADDR = "_op_input_l1_addr";
 const std::string ATTR_NAME_OP_INPUT_L1_VALID_SIZE = "_op_input_l1_valid_size";
+const std::string ATTR_NAME_ENGINE_NAME_FOR_LX = "_lxfusion_engine_name";
+const std::string ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX = "_lxfusion_op_kernel_lib_name";
+const std::string ATTR_NAME_NEED_LX_FUSION = "_lx_fusion";
+const std::string ATTR_NAME_OPTIMIZE_GROUP = "_optimize_group";
+const std::string ATTR_NAME_OP_COMPILE_STRATEGY = "_op_compile_strategy";
+const std::string ATTR_NAME_TBE_KERNEL_NAME = "_tbe_kernel_name";
+const std::string ATTR_NAME_TBE_KERNEL_BUFFER = "_tbe_kernel_buffer";

 // Op debug attrs
 const std::string ATTR_OP_DEBUG_FLAG = "_op_debug_flag";
diff --git a/src/common/graph/graph.mk b/src/common/graph/graph.mk
index b007dac8..9e9ffa3a 100644
--- a/src/common/graph/graph.mk
+++ b/src/common/graph/graph.mk
@@ -8,6 +8,7 @@ COMMON_LOCAL_SRC_FILES := \
     ./proto/task.proto \
     ./proto/fwk_adapter.proto \
     ./proto/op_mapping_info.proto \
+    ./proto/dump_task.proto \
     ./anchor.cc \
     ./ge_attr_value.cc \
     ./attr_value.cc \
@@ -29,6 +30,7 @@ COMMON_LOCAL_SRC_FILES := \
     ./ge_tensor.cc \
    ./detail/attributes_holder.cc \
     ./utils/anchor_utils.cc \
+    ./utils/tuning_utils.cc \
     ./utils/graph_utils.cc \
     ./utils/ge_ir_utils.cc \
     ./utils/node_utils.cc \
@@ -51,6 +53,7 @@ COMMON_LOCAL_C_INCLUDES := \
     proto/task.proto \
     proto/fwk_adapter.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     inc \
     inc/external \
     inc/external/graph \
diff --git a/src/common/graph/model_serialize.cc b/src/common/graph/model_serialize.cc
index 673bb31b..16855fc5 100644
--- a/src/common/graph/model_serialize.cc
+++ b/src/common/graph/model_serialize.cc
@@ -195,9 +195,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ModelSerializeImp::Serialize
     }
   }
   // Outputs
-  for (const auto &output : graph->GetOutputNodes()) {
-    if (output != nullptr) {
-      graph_proto->add_output(output->GetName() + ":0");
+  for (const auto &output : graph->GetGraphOutNodesInfo()) {
+    if (output.first != nullptr) {
+      graph_proto->add_output(output.first->GetName() + ":" + std::to_string(output.second));
+      GELOGI("Add output to graph proto, node name:%s, index:%d", output.first->GetName().c_str(), output.second);
     }
   }
   if (graph->attrs_.GetProtoMsg() != nullptr) {
@@ -440,7 +441,8 @@ bool ModelSerializeImp::HandleNodeNameRef() {
       }
       GE_IF_BOOL_EXEC(item.graph == nullptr, continue);
-      auto ret = item.graph->AddOutputNode(node_it->second);
+      auto ret = item.graph->AddOutputNodeByIndex(node_it->second, item.index);
+      GELOGI("node name:%s, item.index:%d", node_it->second->GetName().c_str(), item.index);
       if (ret == nullptr) {
         GELOGE(GRAPH_FAILED, "AddOutputNode failed.");
         return false;
diff --git a/src/common/graph/op_desc.cc b/src/common/graph/op_desc.cc
index 0b22eb83..706ec9cd 100644
--- a/src/common/graph/op_desc.cc
+++ b/src/common/graph/op_desc.cc
@@ -219,6 +219,10 @@ graphStatus OpDesc::AddInputDesc(const string &name, const ge::GeTensorDesc &inp
     }
     inputs_desc_.push_back(in_desc);
     (void)input_name_idx_.insert(make_pair(name, index));
+    if (find(register_input_name_.begin(), register_input_name_.end(), name) == register_input_name_.end()) {
+      register_input_name_.push_back(name);
+    }
+
     return GRAPH_SUCCESS;
   }
 }
@@ -255,6 +259,38 @@ graphStatus OpDesc::AddInputDescMiddle(const string &name, const unsigned int nu
   return GRAPH_SUCCESS;
 }

+graphStatus OpDesc::AddOutputDescMiddle(const string &name, const unsigned int num, size_t index) {
+  for (unsigned int i = 0; i < num; i++) {
+    string output_name = name + std::to_string(i);
+    GE_CHK_BOOL_RET_STATUS((output_name_idx_.find(output_name) == output_name_idx_.end()), GRAPH_FAILED,
+                           "Add output tensor_desc failed, name[%s] already exists.", output_name.c_str());
+
+    std::shared_ptr<GeTensorDesc> out_desc = ComGraphMakeShared<GeTensorDesc>(GeTensorDesc());
+    if (out_desc == nullptr) {
+      GELOGE(GRAPH_FAILED, "AddOutputDescMiddle failed, malloc shared_ptr failed.");
+      return GRAPH_FAILED;
+    }
+
+    if (index > outputs_desc_.size()) {
+      GELOGE(GRAPH_FAILED, "AddOutputDescMiddle failed, insert index should not be more than outputs size.");
+      return GRAPH_FAILED;
+    }
+
+    (void)outputs_desc_.insert(outputs_desc_.begin() + index + i, out_desc);
+
+    // Update index in output_name_idx
+    for (auto it = output_name_idx_.begin(); it != output_name_idx_.end(); ++it) {
+      if (it->second >= (index + i)) {
+        it->second += 1;
+      }
+    }
+
+    (void)output_name_idx_.insert(make_pair(output_name, i + index));
+  }
+
+  return GRAPH_SUCCESS;
+}
+
 graphStatus OpDesc::AddInputDescForward(const string &name, const unsigned int num) {
   for (unsigned int i = 0; i < num; i++) {
     string input_name = name + std::to_string(i);
@@ -550,6 +586,9 @@ graphStatus OpDesc::AddOutputDesc(const string &name, const ge::GeTensorDesc &ou
   }
   outputs_desc_.push_back(tensor);
   (void)output_name_idx_.insert(make_pair(name, index));
+  if (find(register_output_name_.begin(), register_output_name_.end(), name) == register_output_name_.end()) {
+    register_output_name_.push_back(name);
+  }
   return GRAPH_SUCCESS;
 }
@@ -655,6 +694,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI
   return inputs_desc_[it->second];
 }

+graphStatus OpDesc::AddRegisterInputName(const std::string &name) {
+  if (find(register_input_name_.begin(), register_input_name_.end(), name) == register_input_name_.end()) {
+    register_input_name_.push_back(name);
+  }
+
+  return GRAPH_SUCCESS;
+}
+
+vector<string> OpDesc::GetRegisterInputName() const { return register_input_name_; }
+
 graphStatus OpDesc::AddDynamicInputDesc(const string &name, const unsigned int num, bool is_push_back) {
   if (is_push_back) {
     for (unsigned int i = 0; i < num; i++) {
@@ -663,6 +712,10 @@ graphStatus OpDesc::AddDynamicInputDesc(const string &name, const unsigned int n
   } else {
     if (AddInputDescForward(name, num) != GRAPH_SUCCESS) return GRAPH_FAILED;
   }
+  if (AddRegisterInputName(name) != GRAPH_SUCCESS) {
+    return GRAPH_FAILED;
+  }
+
   return GRAPH_SUCCESS;
 }

@@ -673,6 +726,16 @@ graphStatus OpDesc::AddDynamicInputDescByIndex(const string &name, const unsigne
   return GRAPH_SUCCESS;
 }

+graphStatus OpDesc::AddRegisterOutputName(const string &name) {
+  if (find(register_output_name_.begin(), register_output_name_.end(), name) == register_output_name_.end()) {
+    register_output_name_.push_back(name);
+  }
+
+  return GRAPH_SUCCESS;
+}
+
+vector<string> OpDesc::GetRegisterOutputName() const { return register_output_name_; }
+
 graphStatus OpDesc::AddDynamicOutputDesc(const string &name, const unsigned int num, bool is_push_back) {
   if (is_push_back) {
     for (unsigned int i = 0; i < num; i++) {
@@ -681,6 +744,10 @@ graphStatus OpDesc::AddDynamicOutputDesc(const string &name, const unsigned int
   } else {
     if (AddOutputDescForward(name, num) != GRAPH_SUCCESS) return GRAPH_FAILED;
   }
+
+  if (AddRegisterOutputName(name) != GRAPH_SUCCESS) {
+    return GRAPH_FAILED;
+  }
   return GRAPH_SUCCESS;
 }
diff --git a/src/common/graph/opsproto/opsproto_manager.cc b/src/common/graph/opsproto/opsproto_manager.cc
index 4c8c1be5..d482715b 100644
--- a/src/common/graph/opsproto/opsproto_manager.cc
+++ b/src/common/graph/opsproto/opsproto_manager.cc
@@ -31,6 +31,13 @@ OpsProtoManager *OpsProtoManager::Instance() {
 }

 bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &options) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  if (is_init_) {
+    GELOGI("OpsProtoManager is already initialized.");
+    return true;
+  }
+
   /*lint -e1561*/
   auto proto_iter = options.find("ge.opsProtoLibPath");
   /*lint +e1561*/
@@ -42,10 +49,19 @@ bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &optio

   pluginPath_ = proto_iter->second;
   LoadOpsProtoPluginSo(pluginPath_);

+  is_init_ = true;
+
   return true;
 }

 void OpsProtoManager::Finalize() {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  if (!is_init_) {
+    GELOGI("OpsProtoManager is not initialized.");
+    return;
+  }
+
   for (auto handle : handles_) {
     if (handle != nullptr) {
       if (dlclose(handle) != 0) {
@@ -57,6 +73,8 @@ void OpsProtoManager::Finalize() {
       GELOGW("close opsprotomanager handler failure, handler is nullptr");
     }
   }
+
+  is_init_ = false;
 }

 static std::vector<std::string> Split(const std::string &str, char delim) {
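Initialize()/Finalize() are now mutex-guarded and idempotent, so repeated calls from multiple sessions are safe; usage is unchanged (a sketch; the plugin path value is illustrative):

    std::map<std::string, std::string> options;
    options["ge.opsProtoLibPath"] = "/usr/local/Ascend/opp/op_proto/";  // illustrative path
    ge::OpsProtoManager *manager = ge::OpsProtoManager::Instance();
    (void)manager->Initialize(options);  // a second call now just logs and returns true
    // ... use the loaded op protos ...
    manager->Finalize();  // no-op if never initialized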
diff --git a/src/common/graph/shape_refiner.cc b/src/common/graph/shape_refiner.cc
index 35c109af..a87e3753 100644
--- a/src/common/graph/shape_refiner.cc
+++ b/src/common/graph/shape_refiner.cc
@@ -601,7 +601,7 @@ InferenceContextPtr CreateInferenceContext(const std::unordered_map<NodePtr, In
-std::unordered_map<NodePtr, InferenceContextPtr> context_map;
+thread_local std::unordered_map<NodePtr, InferenceContextPtr> context_map;
 }

 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ShapeRefiner::ClearContextMap() { context_map.clear(); }
@@ -645,6 +645,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ShapeRefiner::InferSh
   graphStatus status = InferShapeAndType(node, op, before_subgraph);
   if (status == GRAPH_PARAM_INVALID || status == GRAPH_SUCCESS) {
     if (is_unknown_graph) {
+      PrintInOutTensorShape(node, "after_infershape when running");
       return GRAPH_SUCCESS;
     }
     auto op_desc = node->GetOpDesc();
diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc
index 19c28c63..e2f9f857 100644
--- a/src/common/graph/utils/graph_utils.cc
+++ b/src/common/graph/utils/graph_utils.cc
@@ -29,6 +29,7 @@
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
+#include <atomic>

 #include "./ge_context.h"
 #include "debug/ge_util.h"
@@ -57,6 +58,7 @@ namespace {
 const int32_t kBaseOfIntegerValue = 10;
 #ifdef FMK_SUPPORT_DUMP
 const char *const kDumpGeGraph = "DUMP_GE_GRAPH";
+const int kDumpGraphIndexWidth = 5;
 #endif
 const char *const kDumpGraphLevel = "DUMP_GRAPH_LEVEL";
 const char *const kDumpStrBuild = "Build";
@@ -431,10 +433,15 @@ GraphUtils::InsertNodeAfter(const OutDataAnchorPtr &src, const std::vector<InDa
   OutControlAnchorPtr src_out_ctrl_anchor = src_node->GetOutControlAnchor();
   GE_CHECK_NOTNULL(src_out_ctrl_anchor);
+  bool ctrl_edge_flag = true;
+  std::string type = NodeUtils::GetNodeType(src->GetOwnerNode());
+  if ((type == SWITCH) || (type == REFSWITCH) || (type == SWITCHN)) {
+    ctrl_edge_flag = false;
+  }
+
   for (auto &dst : dsts) {
     GE_CHECK_NOTNULL(dst);
     NodePtr dst_node = dst->GetOwnerNode();
-    GE_CHECK_NOTNULL(dst_node);
     GELOGI("Insert node %s between %s->%s.", insert_node->GetName().c_str(), src_node->GetName().c_str(),
            dst_node->GetName().c_str());
     if (src_node->GetOwnerComputeGraph() != dst_node->GetOwnerComputeGraph()) {
@@ -450,11 +457,12 @@ GraphUtils::InsertNodeAfter(const OutDataAnchorPtr &src, const std::vector<InDa

-  OutControlAnchorPtr new_out_ctrl_anchor = insert_node->GetOutControlAnchor();
-  GE_CHECK_NOTNULL(new_out_ctrl_anchor);
+  if (!ctrl_edge_flag) {
+    continue;
+  }
   for (const InControlAnchorPtr &peer_in_ctrl_anchor : src_out_ctrl_anchor->GetPeerInControlAnchors()) {
     if ((RemoveEdge(src_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS) ||
-        (AddEdge(new_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS)) {
+        (AddEdge(insert_node->GetOutControlAnchor(), peer_in_ctrl_anchor) != GRAPH_SUCCESS)) {
       GELOGE(GRAPH_FAILED, "ReplaceEdge from %s->%s to %s->%s failed.", src_node->GetName().c_str(),
              peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), insert_node->GetName().c_str(),
              peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str());
@@ -552,7 +560,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::MatchDumpStr(con

 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(const ge::ComputeGraphPtr &graph,
                                                                             const std::string &suffix,
-                                                                            bool is_always_dump) {
+                                                                            bool is_always_dump,
+                                                                            const std::string &user_graph_name) {
 #ifdef FMK_SUPPORT_DUMP
   char *dump_ge_graph = std::getenv(kDumpGeGraph);
   GE_IF_BOOL_EXEC(dump_ge_graph == nullptr && !is_always_dump, return;);
@@ -563,32 +572,33 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons
   }

   // file name
-  static int file_idx = 0;
-  const int dump_graph_index_width = 5;
-  file_idx++;
-  GELOGD("Start to dump om txt: %d", file_idx);
+  static std::atomic_long atomic_file_index(0);
+  auto file_index = atomic_file_index.fetch_add(1);
+  GELOGD("Start to dump om txt: %ld", file_index);

-  static int max_dumpfile_num = 0;
-  if (max_dumpfile_num == 0) {
+  thread_local long max_dump_file_num = 0;
+  if (max_dump_file_num == 0) {
     string opt = "0";
     (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt);
-    max_dumpfile_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
+    max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
   }
-  if (max_dumpfile_num != 0 && file_idx > max_dumpfile_num) {
-    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileCnt=%d.", max_dumpfile_num);
+  if (max_dump_file_num != 0 && file_index > max_dump_file_num) {
+    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileCnt=%ld.", max_dump_file_num);
     return;
   }

   std::stringstream stream_file_name;
-  stream_file_name << "ge_proto_" << std::setw(dump_graph_index_width) << std::setfill('0') << file_idx;
+  stream_file_name << "ge_proto_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index;
   stream_file_name << "_" << suffix << ".txt";
-  std::string proto_file = stream_file_name.str();
+  std::string proto_file = user_graph_name.empty() ? stream_file_name.str() : user_graph_name;

   // Create buffer
   ge::Model model("", "");
   model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast<ComputeGraph>(graph)));
   Buffer buffer;
-  model.Save(buffer, true);
+  const int64_t kDumpLevel =
+      (dump_ge_graph != nullptr) ? std::strtol(dump_ge_graph, nullptr, kBaseOfIntegerValue) : ge::OnnxUtils::NO_DUMP;
+  model.Save(buffer, kDumpLevel != ge::OnnxUtils::DUMP_ALL);

   // Write file
   ge::proto::ModelDef ge_proto;
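The dump counters are now per-process atomics and the limits thread-local, but enabling dumps is unchanged: it is still driven by environment variables and GE options (a sketch; values illustrative):

    #include <cstdlib>
    // enable textual graph dumps before running the build
    setenv("DUMP_GE_GRAPH", "2", 1);     // dump level consumed by GraphUtils::DumpGEGraph()
    setenv("DUMP_GRAPH_LEVEL", "1", 1);  // coarse filter consumed by GraphUtils::MatchDumpStr()
    // file-count/size caps come from the GE options OPTION_GE_MAX_DUMP_FILE_NUM / OPTION_GE_MAX_DUMP_FILE_SIZE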
@@ -631,6 +641,35 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraph(cons
   }
 }

+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraph(const char *file,
+                                                                            ge::ComputeGraphPtr &compute_graph) {
+  ge::proto::ModelDef model_def;
+  // Get ModelDef object from file generated by DumpGEGraph()
+  if (!ReadProtoFromTextFile(file, &model_def)) {
+    GELOGE(GRAPH_FAILED, "Get ModelDef failed from file");
+    return false;
+  }
+  ge::Model model;
+  // Get Model object from ModelDef by deserializing ModelDef
+  if (model.Load(model_def) == GRAPH_SUCCESS) {
+    GE_CHK_BOOL_EXEC(GraphUtils::GetComputeGraph(model.GetGraph()) != nullptr, return false,
+                     "Get compute graph is nullptr");
+    compute_graph = GraphUtils::GetComputeGraph(model.GetGraph());
+    for (const auto &node : compute_graph->GetDirectNode()) {
+      GE_CHECK_NOTNULL(node);
+      GELOGI("Node %s set owner graph", node->GetName().c_str());
+      if (node->SetOwnerComputeGraph(compute_graph) != GRAPH_SUCCESS) {
+        GELOGE(GRAPH_FAILED, "Node %s set owner graph failed", node->GetName().c_str());
+        return false;
+      }
+    }
+    return true;
+  } else {
+    GELOGE(GRAPH_FAILED, "Get Model failed from ModelDef");
+    return false;
+  }
+}
+
 // Printing protocol messages in text format is useful for debugging and human editing of messages.
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToTextFile(
     const google::protobuf::Message &proto, const char *real_path) {
@@ -666,16 +705,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToText
     return;
   }
   if (fseek(file, 0L, SEEK_END) == 0) {
-    int64_t fileSize = ftell(file);
-    static int64_t maxDumpFileSize = 0;
-    if (maxDumpFileSize == 0) {
+    long fileSize = ftell(file);
+    thread_local long max_dump_file_size = 0;
+    if (max_dump_file_size == 0) {
       string opt = "0";
       // Can not check return value
       (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_SIZE, opt);
-      maxDumpFileSize = atol(opt.c_str());
+      max_dump_file_size = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
     }
-    if (maxDumpFileSize != 0 && fileSize != -1 && fileSize > maxDumpFileSize) {
-      GELOGW("dump graph file size > maxDumpFileSize, maxDumpFileSize=%ld.", maxDumpFileSize);
+    if (max_dump_file_size != 0 && fileSize != -1 && fileSize > max_dump_file_size) {
+      GELOGW("dump graph file size > maxDumpFileSize, maxDumpFileSize=%ld.", max_dump_file_size);
       GE_IF_BOOL_EXEC(std::remove(real_path) != 0, GELOGW("remove %s failed", real_path));
       GE_CHK_BOOL_EXEC(fclose(file) == 0, return, "Fclose %s failed", real_path);
       return;
@@ -734,25 +773,23 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraphToOnn
   }

   // 2.Set file name
-  static int file_index = 0;
-  file_index++;
-  GELOGD("Start to dump ge onnx file: %d", file_index);
+  static std::atomic_long atomic_file_index(0);
+  auto file_index = atomic_file_index.fetch_add(1);
+  GELOGD("Start to dump ge onnx file: %ld", file_index);

-  static int max_dumpfile_num = 0;
-  if (max_dumpfile_num == 0) {
+  thread_local long max_dump_file_num = 0;
+  if (max_dump_file_num == 0) {
     string opt = "0";
     (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt);
-    max_dumpfile_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
+    max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue);
   }
-  if (max_dumpfile_num != 0 && file_index > max_dumpfile_num) {
-    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileNum=%d.", max_dumpfile_num);
+  if (max_dump_file_num != 0 && file_index > max_dump_file_num) {
+    GELOGW("dump graph file cnt > maxDumpFileNum, maxDumpFileNum=%ld.", max_dump_file_num);
     return;
   }

-  /// 99999 graphs can be dumped at most at one time
-  /// setw(5) is for formatted sort
   std::stringstream stream_file_name;
-  stream_file_name << "ge_onnx_" << std::setw(5) << std::setfill('0') << file_index;
+  stream_file_name << "ge_onnx_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index;
   stream_file_name << "_graph_" << compute_graph.GetGraphID();
   stream_file_name << "_" << suffix << ".pbtxt";
   std::string proto_file = stream_file_name.str();
@@ -1363,6 +1400,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr GraphUtils::FindR
 /// Make a copy of ComputeGraph.
 /// @param graph: original graph.
 /// @param prefix: node name prefix of new graph.
+/// @param output_nodes: output nodes of new graph.
 /// @return ComputeGraphPtr
 ///
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr
@@ -1399,6 +1437,14 @@ GraphUtils::CloneGraph(const ComputeGraphPtr &graph, const std::string &prefix,
     }
   }

+  std::string session_graph_id;
+  if (AttrUtils::GetStr(*graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
+    bool ret = AttrUtils::SetStr(*new_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id);
+    if (!ret) {
+      GELOGE(GRAPH_FAILED, "Set attr ATTR_NAME_SESSION_GRAPH_ID failed.");
+      return nullptr;
+    }
+  }
   return new_graph;
 }
diff --git a/src/common/graph/utils/op_desc_utils.cc b/src/common/graph/utils/op_desc_utils.cc
index 92883877..e0579581 100644
--- a/src/common/graph/utils/op_desc_utils.cc
+++ b/src/common/graph/utils/op_desc_utils.cc
@@ -479,6 +479,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<GeTensorPtr> OpDescUtils::
     return ret;
   }

+  if (node.GetType() == DATA) {
+    auto parent = NodeUtils::GetParentInput(node);
+    if ((parent != nullptr) && NodeUtils::IsConst(*parent)) {
+      auto weight = MutableWeights(parent->GetOpDesc());
+      if (weight == nullptr) {
+        GELOGI("const op has no weight, op name:%s", parent->GetName().c_str());
+        return ret;
+      }
+      ret.push_back(weight);
+    }
+    return ret;
+  }
+
   // Other operators, get weights from connected constop
   auto input_nodes = GetConstInputs(node);
   for (const auto &input_node : input_nodes) {
@@ -560,11 +573,9 @@ OpDescPtr OpDescUtils::CreateConstOp(const GeTensorPtr &tensor_ptr) {

   const_opdesc->SetType(CONSTANT);

-  static int const_count = 0;
-  const_opdesc->SetName("dynamic_const_" + std::to_string(const_count));
-
+  thread_local int64_t const_count = 0;
+  const_opdesc->SetName("dynamic_const_" + std::to_string(GetTid()) + "_" + std::to_string(const_count));
   GELOGI("add const op: %s", const_opdesc->GetName().c_str());
-  ++const_count;

   (void)const_opdesc->AddOutputDesc(tensor_ptr->GetTensorDesc());
diff --git a/src/common/graph/utils/tuning_utils.cc b/src/common/graph/utils/tuning_utils.cc
new file mode 100644
index 00000000..0f07a197
--- /dev/null
+++ b/src/common/graph/utils/tuning_utils.cc
@@ -0,0 +1,684 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/tuning_utils.h"
+#include "../debug/ge_util.h"
+#include "../debug/ge_op_types.h"
+
+namespace ge {
+const std::string peer_node_name_attr = "_peerNodeName";
+const std::string parent_node_name_attr = "_parentNodeName";
+const std::string alias_name_attr = "_aliasName";
+const std::string parent_node_attr = "parentNode";
+const std::string parent_node_anchor_index_attr = "_parentNodeAnchorIndex";
+const std::string tuning_subgraph_prefix = "/aicore_subgraph_";
+const std::string non_tuning_subgraph_prefix = "/subgraph_";
+const std::set<std::string> kPartitionOpTypes = {PLACEHOLDER, END};
+const std::set<std::string> kExeTypes = {DATA, NETOUTPUT};
+NodeNametoNodeNameMap TuningUtils::data_2_netoutput_;
+NodetoNodeNameMap TuningUtils::data_node_2_netoutput_;
+NodetoNodeMap TuningUtils::data_node_2_netoutput_node_;
+NodeSet TuningUtils::netoutput_nodes_;
+NodeSet TuningUtils::merged_graph_nodes_;
+SubgraphCreateOutNode TuningUtils::create_output_;
+std::mutex TuningUtils::mutex_;
+
+std::string TuningUtils::PrintCheckLog() {
+  std::stringstream ss;
+  ss << "d2n:{";
+  for (const auto &pair : data_2_netoutput_) {
+    ss << "data:" << pair.first << "-"
+       << "netoutput:" << pair.second;
+    ss << " | ";
+  }
+  ss << "}";
+  ss << "netoutputs:{";
+  for (const auto &node : netoutput_nodes_) {
+    ss << "netoutput:" << node->GetName();
+    ss << " | ";
+  }
+  ss << "}";
+  return ss.str();
+}
+
+std::string TuningUtils::GetNodeNameByAnchor(const Anchor *anchor) {
+  if (anchor == nullptr) {
+    GELOGE(GRAPH_FAILED, "Anchor is nullptr");
+    return "Null";
+  }
+  auto node = anchor->GetOwnerNode();
+  return node == nullptr ? "Null" : node->GetName();
+}
"Null" : node->GetName(); +} + +// part 1 +graphStatus TuningUtils::ConvertGraphToFile(std::vector tuning_subgraphs, + std::vector non_tuning_subgraphs, bool exe_flag, + const std::string &path, const std::string &user_path) { + int64_t i = 0; + int64_t j = 0; + std::lock_guard lock(mutex_); + for (auto &subgraph : tuning_subgraphs) { + create_output_.emplace(subgraph, nullptr); + auto help_info = HelpInfo{i, exe_flag, true, path, user_path}; + if (MakeExeGraph(subgraph, help_info) != SUCCESS) { + GELOGE(GRAPH_FAILED, "TUU:subgraph %zu generate exe graph failed", i); + return GRAPH_FAILED; + } + i++; + } + + for (auto &subgraph : non_tuning_subgraphs) { + create_output_.emplace(subgraph, nullptr); + auto help_info = HelpInfo{j, true, false, path, user_path}; + if (MakeExeGraph(subgraph, help_info) != SUCCESS) { + GELOGE(GRAPH_FAILED, "TUU:non tuning_subgraph %zu generate exe graph failed", j); + return GRAPH_FAILED; + } + j++; + } + create_output_.clear(); + return SUCCESS; +} + +// +---------------+ +// | pld pld | +// | \ / | +// | relu relu | +// | \ / | +// | add | +// | | | +// | end | +// +---------------+ +// | +// | +// V +// +---------------+ +// | data data | +// | \ / | +// | relu relu | +// | \ / | +// | add | +// | | | +// | netoutput | +// +---------------+ +graphStatus TuningUtils::MakeExeGraph(ComputeGraphPtr &exe_graph, const HelpInfo &help_info) { + GE_CHECK_NOTNULL(exe_graph); + // if not make exe, just dump and return + if (!help_info.exe_flag) { + DumpGraphToPath(exe_graph, help_info.index, help_info.is_tuning_graph, help_info.path); + GELOGI("TUU:just return, dump original sub_graph[%s]index[%d]", exe_graph->GetName().c_str(), help_info.index); + return SUCCESS; + } + // modify sub graph + for (NodePtr &node : exe_graph->GetDirectNode()) { + // 1.handle pld + if (node->GetType() == PLACEHOLDER) { + if (HandlePld(node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), + exe_graph->GetName().c_str()); + return FAILED; + } + } + // 2.handle end + if (node->GetType() == END) { + if (HandleEnd(node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), + exe_graph->GetName().c_str()); + return FAILED; + } + } + } + graphStatus ret = exe_graph->TopologicalSorting(); + if (ret != SUCCESS) { + GELOGE(ret, "Graph[%s] topological sort failed, ret:%d.", exe_graph->GetName().c_str(), ret); + return ret; + } + // dump subgraphs which modified by us + if (help_info.user_path.empty()) { + DumpGraphToPath(exe_graph, help_info.index, help_info.is_tuning_graph, help_info.path); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, help_info.user_path); + } + return SUCCESS; +} + +void TuningUtils::DumpGraphToPath(ComputeGraphPtr &exe_graph, int64_t index, bool is_tuning_graph, std::string path) { + if (!path.empty()) { + if (is_tuning_graph) { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + non_tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } + } else { + path = "./"; + if (is_tuning_graph) { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } else { + GraphUtils::DumpGEGraph(exe_graph, "", true, path + non_tuning_subgraph_prefix + std::to_string(index) + ".txt"); + } + } +} + +graphStatus TuningUtils::CreateDataNode(NodePtr &node, NodePtr &data_node) { + auto graph = 
+
+graphStatus TuningUtils::CreateDataNode(NodePtr &node, NodePtr &data_node) {
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  auto data_op_desc = ComGraphMakeShared<OpDesc>(node->GetName(), DATA);
+  GE_CHECK_NOTNULL(data_op_desc);
+  auto pld_op_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(pld_op_desc);
+  auto output_desc = pld_op_desc->GetOutputDesc(0);  // only one output for pld and data
+  // data inputdesc & outputdesc set as same
+  if (data_op_desc->AddInputDesc(output_desc) != SUCCESS) {
+    GELOGE(FAILED, "TUU:data node %s AddInputDesc failed", data_op_desc->GetName().c_str());
+    return FAILED;
+  }
+  if (data_op_desc->AddOutputDesc(output_desc) != SUCCESS) {
+    GELOGE(FAILED, "TUU:data node %s AddOutputDesc failed", data_op_desc->GetName().c_str());
+    return FAILED;
+  }
+  data_node = graph->AddNode(data_op_desc);
+  GE_CHECK_NOTNULL(data_node);
+  if (data_node->SetOwnerComputeGraph(graph) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:SetOwnerComputeGraph failed");
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::AddAttrToDataNodeForMergeGraph(const NodePtr &pld, NodePtr &data_node) {
+  auto op_desc = data_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+
+  auto pld_desc = pld->GetOpDesc();
+  GE_CHECK_NOTNULL(pld_desc);
+  // inherit
+  // a. set `end's input node type` as attr
+  std::string parent_op_type;
+  if (!AttrUtils::GetStr(pld_desc, "parentOpType", parent_op_type)) {
+    GELOGE(FAILED, "TUU:pld %s get parentOpType failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, "parentOpType", parent_op_type);
+  // b. set `end's input node name` as attr
+  std::string parent_op_name;
+  if (!AttrUtils::GetStr(pld_desc, parent_node_name_attr, parent_op_name)) {
+    GELOGE(FAILED, "TUU:pld %s get _parentNodeName failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, parent_node_name_attr, parent_op_name);
+  // c. set `end's input node's out anchor index` as attr
+  int parent_node_anchor_index;
+  if (!AttrUtils::GetInt(pld_desc, "anchorIndex", parent_node_anchor_index)) {
+    GELOGE(FAILED, "TUU:pld %s get anchorIndex failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetInt(op_desc, parent_node_anchor_index_attr, parent_node_anchor_index);
+  GELOGD("TUU:from node %s(%s) to add attr to node %s(%s) success", pld->GetName().c_str(), pld->GetType().c_str(),
+         data_node->GetName().c_str(), data_node->GetType().c_str());
+  // d. set `end node name` as attr
+  std::string peer_end_name;
+  if (!AttrUtils::GetStr(pld_desc, peer_node_name_attr, peer_end_name)) {
+    GELOGE(FAILED, "TUU:pld %s get _peerNodeName failed", pld_desc->GetName().c_str());
+    return FAILED;
+  }
+  (void)AttrUtils::SetStr(op_desc, peer_node_name_attr, peer_end_name);
+  GELOGD("TUU:from node %s(%s) to add attr to node %s(%s) success", pld->GetName().c_str(), pld->GetType().c_str(),
+         data_node->GetName().c_str(), data_node->GetType().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::ChangePld2Data(NodePtr &node, NodePtr &data_node) {
+  auto type_pld = node->GetType();
+  auto type_data = data_node->GetType();
+  if (type_pld != PLACEHOLDER || type_data != DATA) {
+    GELOGE(FAILED, "TUU:Failed to change node %s from type %s to type %s", node->GetName().c_str(), type_pld.c_str(),
+           type_data.c_str());
+    return FAILED;
+  }
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  std::vector<int> output_map(node->GetAllOutDataAnchorsSize());
+  for (size_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) {
+    output_map[i] = static_cast<int>(i);
+  }
+
+  auto ret = GraphUtils::ReplaceNodeAnchors(data_node, node, {}, output_map);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to replace node %s by node %s error node %u", node->GetName().c_str(),
+           data_node->GetName().c_str(), ret);
+    return FAILED;
+  }
+
+  NodeUtils::UnlinkAll(*node);
+
+  ret = GraphUtils::RemoveNodeWithoutRelink(graph, node);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to remove node %s from graph", node->GetName().c_str());
+    return FAILED;
+  }
+
+  GELOGD("TUU:Remove node %s(%s) by the ChangePld2Data process, replace it with node %s(%s)", node->GetName().c_str(),
+         node->GetType().c_str(), data_node->GetName().c_str(), data_node->GetType().c_str());
+  return ret;
+}
+
+graphStatus TuningUtils::HandlePld(NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  NodePtr data_node = nullptr;
+
+  // 1. create data node
+  if (CreateDataNode(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  // 2. add necessary info to data_node for recovering the whole graph
+  if (AddAttrToDataNodeForMergeGraph(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  // 3. replace the pld node by the data node created before
+  if (ChangePld2Data(node, data_node) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str());
+    return FAILED;
+  }
+  GELOGD("TUU:pld[%s] handle success", node->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::CreateNetOutput(NodePtr &node, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(node);
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHECK_NOTNULL(graph);
+  auto search = create_output_.find(graph);
+  if (search == create_output_.end()) {
+    GELOGE(FAILED, "TUU:node %s's owner sub graph %s not exist in create_output map", node->GetName().c_str(),
+           graph->GetName().c_str());
+    return FAILED;
+  }
+  if (search->second != nullptr) {
+    out_node = search->second;
+    GELOGD("TUU:sub graph %s has created output node, just return", graph->GetName().c_str());
+    return SUCCESS;
+  }
+  auto out_op_desc = ComGraphMakeShared<OpDesc>(node->GetName(), NETOUTPUT);
+  GE_CHECK_NOTNULL(out_op_desc);
+  out_node = graph->AddNode(out_op_desc);
+  GE_CHECK_NOTNULL(out_node);
+  if (out_node->SetOwnerComputeGraph(graph) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:SetOwnerComputeGraph failed");
+    return FAILED;
+  }
+  create_output_[graph] = out_node;
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::AddAttrToNetOutputForMergeGraph(const NodePtr &end, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(end);
+  GE_CHECK_NOTNULL(out_node);
+  auto op_desc = out_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  std::vector<std::string> alias_names = {};
+  (void)AttrUtils::GetListStr(op_desc, alias_name_attr, alias_names);
+  alias_names.push_back(end->GetName());
+  (void)AttrUtils::SetListStr(op_desc, alias_name_attr, alias_names);
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::LinkEnd2NetOutput(NodePtr &end_node, NodePtr &out_node) {
+  GE_CHECK_NOTNULL(end_node);
+  GE_CHECK_NOTNULL(out_node);
+  // check whether the end node's in node is a control node or a normal data node
+  AnchorPtr end_in_anchor = (end_node->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
+                              ? Anchor::DynamicAnchorCast<Anchor>(end_node->GetInControlAnchor())
+                              : Anchor::DynamicAnchorCast<Anchor>(end_node->GetInDataAnchor(0));
+  auto src_anchor = end_in_anchor->GetFirstPeerAnchor();  // src_anchor should be only 1
+  if (GraphUtils::RemoveEdge(src_anchor, end_in_anchor) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "TUU:remove end input edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+           GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+           GetNodeNameByAnchor(end_in_anchor.get()).c_str(), end_in_anchor->GetIdx(), end_node->GetName().c_str(),
+           end_node->GetOwnerComputeGraph()->GetName().c_str());
+    return FAILED;
+  }
+  // add edge between `end in node` and `out_node`
+  if (src_anchor->IsTypeOf<OutDataAnchor>()) {
+    std::shared_ptr<InDataAnchor> anchor =
+      ComGraphMakeShared<InDataAnchor>(out_node, out_node->GetAllInDataAnchors().size());
+    GE_CHECK_NOTNULL(anchor);
+    out_node->in_data_anchors_.push_back(anchor);
+    if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+             GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+             GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(),
+             end_node->GetOwnerComputeGraph()->GetName().c_str());
+      return FAILED;
+    }
+    auto end_op_desc = end_node->GetOpDesc();
+    GE_CHECK_NOTNULL(end_op_desc);
+    auto out_node_op_desc = out_node->GetOpDesc();
+    GE_CHECK_NOTNULL(out_node_op_desc);
+    // end node always has one input
+    if (out_node_op_desc->AddInputDesc(end_op_desc->GetInputDesc(0)) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:node %s add input desc failed.", out_node_op_desc->GetName().c_str());
+      return FAILED;
+    }
+  } else if (src_anchor->IsTypeOf<OutControlAnchor>()) {
+    auto anchor = out_node->GetInControlAnchor();
+    if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s",
+             GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(),
+             GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(),
+             end_node->GetOwnerComputeGraph()->GetName().c_str());
+      return FAILED;
+    }
+  } else {
+    GELOGE(FAILED, "TUU: node_name:%s, graph_name:%s handled failed", end_node->GetName().c_str(),
+           end_node->GetOwnerComputeGraph()->GetName().c_str());
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
node_name:%s, graph_name:%s", + GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(), + GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + auto end_op_desc = end_node->GetOpDesc(); + GE_CHECK_NOTNULL(end_op_desc); + auto out_node_op_desc = out_node->GetOpDesc(); + GE_CHECK_NOTNULL(out_node_op_desc); + // end node always has one input + if (out_node_op_desc->AddInputDesc(end_op_desc->GetInputDesc(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "TUU:node %s add input desc failed.", out_node_op_desc->GetName().c_str()); + return FAILED; + } + } else if (src_anchor->IsTypeOf()) { + auto anchor = out_node->GetInControlAnchor(); + if (GraphUtils::AddEdge(src_anchor, anchor) != GRAPH_SUCCESS) { + GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s", + GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(), + GetNodeNameByAnchor(anchor.get()).c_str(), anchor->GetIdx(), end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + } else { + GELOGE(FAILED, "TUU: node_name:%s, graph_name:%s handled failed", end_node->GetName().c_str(), + end_node->GetOwnerComputeGraph()->GetName().c_str()); + return FAILED; + } + + return SUCCESS; +} + +graphStatus TuningUtils::ChangeEnd2NetOutput(NodePtr &end_node, NodePtr &out_node) { + GE_CHECK_NOTNULL(end_node); + GE_CHECK_NOTNULL(out_node); + auto type_end = end_node->GetType(); + auto type_out = out_node->GetType(); + if (type_end != END || type_out != NETOUTPUT) { + GELOGE(FAILED, "TUU:Failed to change end_node %s from type %s to type %s", end_node->GetName().c_str(), + type_end.c_str(), type_out.c_str()); + return FAILED; + } + // link all `end nodes's in node` to this out_node + if (LinkEnd2NetOutput(end_node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:end_node [%s] LinkEnd2NetOutput failed.", end_node->GetName().c_str()); + return FAILED; + } + // remove `end node` + NodeUtils::UnlinkAll(*end_node); + auto graph = end_node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + if (GraphUtils::RemoveNodeWithoutRelink(graph, end_node) != SUCCESS) { + GELOGE(FAILED, "TUU:end node [%s] RemoveNodeWithoutRelink failed.", end_node->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +graphStatus TuningUtils::HandleEnd(NodePtr &node) { + GE_CHECK_NOTNULL(node); + auto graph = node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + NodePtr out_node = nullptr; + + // 1. create net_output node , add only one NetOutput node to one subgraph + if (CreateNetOutput(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + // 2. add necessary info to out_node for recovery whole graph + if (AddAttrToNetOutputForMergeGraph(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + // 3. replace all end nodes by one output node created before + if (ChangeEnd2NetOutput(node, out_node) != SUCCESS) { + GELOGE(FAILED, "TUU:Failed to handle node %s from graph %s", node->GetName().c_str(), graph->GetName().c_str()); + return FAILED; + } + GELOGD("TUU:end[%s] handle success", node->GetName().c_str()); + return SUCCESS; +} + +// part 2 +graphStatus TuningUtils::ConvertFileToGraph(const map &options, ge::Graph &graph) { + // 1. 
+
+// part 2
+graphStatus TuningUtils::ConvertFileToGraph(const map<int64_t, string> &options, ge::Graph &graph) {
+  // 1. get all subgraph objects
+  std::vector<ComputeGraphPtr> graphs;
+  // options format like {index:"subgraph_path"}
+  for (const auto &pair : options) {
+    ComputeGraphPtr compute_graph = ComGraphMakeShared<ComputeGraph>(std::to_string(pair.first));
+    if (!ge::GraphUtils::LoadGEGraph(pair.second.c_str(), *compute_graph)) {
+      GELOGE(FAILED, "TUU:load graph from file failed");
+      return FAILED;
+    }
+    graphs.push_back(compute_graph);
+  }
+  // 2. merge graph
+  ComputeGraphPtr merged_graph = ComGraphMakeShared<ComputeGraph>("whole_graph_after_tune");
+  GE_CHECK_NOTNULL(merged_graph);
+  if (MergeAllSubGraph(graphs, merged_graph) != SUCCESS) {
+    GELOGE(FAILED, "TUU:MergeGraph failed");
+    return FAILED;
+  }
+  // 3. set parent graph
+  for (const auto &node : merged_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node);
+    if (node->SetOwnerComputeGraph(merged_graph) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:node %s set owner graph failed", node->GetName().c_str());
+      return FAILED;
+    }
+  }
+  graph = GraphUtils::CreateGraphFromComputeGraph(merged_graph);
+  return SUCCESS;
+}
+
+// +----------------------------------+
+// | const        const               |
+// |     \        /                   |
+// |  netoutput(end,end)              |
+// +----------------------------------+
+//                 +
+// +----------------------------------+
+// | data(pld)      data(pld)         |
+// |     \           /                |
+// |    relu       relu               |
+// |       \       /                  |
+// |        \     /                   |
+// |          add                     |
+// |           |                      |
+// |     netoutput(end)               |
+// +----------------------------------+
+//                 +
+// +----------------------------------+
+// | data(pld)                        |
+// |     /                            |
+// | netoutput                        |
+// +----------------------------------+
+//                 |
+//                 |
+//                 V
+// +----------------------------------+
+// | const        const               |
+// |     \        /                   |
+// |    relu    relu                  |
+// |       \    /                     |
+// |        \  /                      |
+// |        add                       |
+// |         |                        |
+// |     netoutput                    |
+// +----------------------------------+
+graphStatus TuningUtils::MergeAllSubGraph(std::vector<ComputeGraphPtr> &subgraphs,
+                                          ComputeGraphPtr &output_merged_compute_graph) {
+  GE_CHECK_NOTNULL(output_merged_compute_graph);
+  // 1. handle all subgraphs
+  for (auto &subgraph : subgraphs) {
+    Status ret_status = MergeSubGraph(subgraph);
+    if (ret_status != SUCCESS) {
+      GELOGE(ret_status, "TUU:subgraph %s merge failed", subgraph->GetName().c_str());
+      return ret_status;
+    }
+  }
+
+  for (const auto &node : merged_graph_nodes_) {
+    (void)output_merged_compute_graph->AddNode(node);
+    GELOGD("TUU:graph %s add node %s success", output_merged_compute_graph->GetName().c_str(), node->GetName().c_str());
+  }
+
+  // 2. remove the data and netoutput nodes added by us
+  if (RemoveDataNetoutputEdge(output_merged_compute_graph) != SUCCESS) {
+    GELOGE(FAILED, "TUU:Failed to merge graph %s", output_merged_compute_graph->GetName().c_str());
+    return FAILED;
+  }
+  graphStatus ret = output_merged_compute_graph->TopologicalSorting();
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Graph[%s] topological sort failed, ret:%d.", output_merged_compute_graph->GetName().c_str(), ret);
+    return ret;
+  }
+  GELOGD("TUU:Print-%s", PrintCheckLog().c_str());
+  GELOGI("TUU:output_merged_compute_graph %s success", output_merged_compute_graph->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::MergeSubGraph(ComputeGraphPtr &subgraph) {
+  for (auto &node : subgraph->GetDirectNode()) {
+    if (kPartitionOpTypes.count(node->GetType()) > 0) {
+      GELOGE(FAILED, "TUU:subgraph passed in should not contain nodes of end or pld type");
+      return FAILED;
+    }
+    // handle data converted from pld node
+    if (node->GetType() == DATA) {
+      auto op_desc = node->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+      std::string peer_out_name;
+      bool has_valid_str =
+        (AttrUtils::GetStr(op_desc, peer_node_name_attr, peer_out_name)) && (!peer_out_name.empty());
+      if (has_valid_str) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        data_2_netoutput_.emplace(op_desc->GetName(), peer_out_name);
+        data_node_2_netoutput_.emplace(node, peer_out_name);
+        continue;
+      }
+    }
+    // handle netoutput converted from end node
+    if (node->GetType() == NETOUTPUT) {
+      auto op_desc = node->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+      std::vector<std::string> out_alias_name;
+      bool has_valid_str =
+        (AttrUtils::GetListStr(op_desc, alias_name_attr, out_alias_name)) && (!out_alias_name.empty());
+      if (has_valid_str) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        netoutput_nodes_.insert(node);
+      }
+    }
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      merged_graph_nodes_.emplace(node);
+    }
+    GELOGD("TUU:subgraph %s add node %s success", subgraph->GetName().c_str(), node->GetName().c_str());
+  }
+  GELOGI("TUU:merge subgraph %s success", subgraph->GetName().c_str());
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::RemoveDataNetoutputEdge(ComputeGraphPtr &graph) {
+  GE_CHECK_NOTNULL(graph);
+  // 1. traverse
+  for (auto &pair : data_node_2_netoutput_) {
+    auto data_node = pair.first;
+    GE_CHECK_NOTNULL(data_node);
+    auto netoutput_name = pair.second;
+    auto netoutput_node = graph->FindNode(netoutput_name);
+    GE_CHECK_NOTNULL(netoutput_node);
+    data_node_2_netoutput_node_.emplace(data_node, netoutput_node);
+    // 2. get `data out anchor` and `net output in anchor` and `net output in node's out anchor`
+    AnchorPtr data_out_anchor = (data_node->GetOutDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
+                                  ? Anchor::DynamicAnchorCast<Anchor>(data_node->GetOutControlAnchor())
+                                  : Anchor::DynamicAnchorCast<Anchor>(data_node->GetOutDataAnchor(0));
+    AnchorPtr net_output_in_anchor = nullptr;
+    AnchorPtr src_out_anchor = nullptr;
+    if (GetInAndOutAnchorPair(data_node, netoutput_node, net_output_in_anchor, src_out_anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:get out node:%s 's in anchor related with data node:%s failed",
+             netoutput_node->GetName().c_str(), data_node->GetName().c_str());
+      return FAILED;
+    }
+    // 3. relink
+    if (GraphUtils::RemoveEdge(src_out_anchor, net_output_in_anchor) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:remove edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+             GetNodeNameByAnchor(src_out_anchor.get()).c_str(), src_out_anchor->GetIdx(),
+             GetNodeNameByAnchor(net_output_in_anchor.get()).c_str(), net_output_in_anchor->GetIdx(),
+             data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+      return FAILED;
+    }
+    GE_CHECK_NOTNULL(data_out_anchor);
+    for (const auto &peer_in_anchor : data_out_anchor->GetPeerAnchors()) {
+      if (GraphUtils::RemoveEdge(data_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
+        GELOGE(FAILED, "TUU:remove edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+               GetNodeNameByAnchor(data_out_anchor.get()).c_str(), data_out_anchor->GetIdx(),
+               GetNodeNameByAnchor(peer_in_anchor.get()).c_str(), peer_in_anchor->GetIdx(),
+               data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+        return FAILED;
+      }
+      if (GraphUtils::AddEdge(src_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
+        GELOGE(FAILED, "TUU:add edge from %s(%d) to %s(%d) failed. node_name:(data:%s;netoutput:%s), graph_name:%s",
+               GetNodeNameByAnchor(src_out_anchor.get()).c_str(), src_out_anchor->GetIdx(),
+               GetNodeNameByAnchor(peer_in_anchor.get()).c_str(), peer_in_anchor->GetIdx(),
+               data_node->GetName().c_str(), netoutput_node->GetName().c_str(), graph->GetName().c_str());
+        return FAILED;
+      }
+    }
+  }
+  // 4. remove the out nodes added by us
+  for (auto &node : netoutput_nodes_) {
+    NodeUtils::UnlinkAll(*node);
+    if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "TUU:Failed to remove node %s from graph", node->GetName().c_str());
+      return FAILED;
+    }
+    GELOGD("TUU:Remove node %s by the RemoveDataNetoutputEdge process success", node->GetName().c_str());
+  }
+  return SUCCESS;
+}
+
+graphStatus TuningUtils::GetInAndOutAnchorPair(NodePtr &data_node, NodePtr &out_node, AnchorPtr &dest_in_anchor,
+                                               AnchorPtr &src_out_anchor) {
+  // 1. get `data parent node name`, i.e. `netoutput input node name`
+  std::string netoutput_input_name;
+  auto op_desc = data_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  if (!AttrUtils::GetStr(op_desc, parent_node_name_attr, netoutput_input_name)) {
+    GELOGE(FAILED, "TUU:Failed to get parent node attr from node %s", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  // 2. find index
+  int parent_node_anchor_index;
+  if (!AttrUtils::GetInt(op_desc, parent_node_anchor_index_attr, parent_node_anchor_index)) {
+    GELOGE(FAILED, "TUU:Failed to get parent node anchor index attr from node %s", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  // 3. find the in data or ctrl anchor by steps 1 & 2
+  for (auto &in_anchor : out_node->GetAllInAnchors()) {
+    GE_CHECK_NOTNULL(in_anchor);
+    for (auto &src_anchor : in_anchor->GetPeerAnchors()) {  // get all peer anchors for ctrl
+      GE_CHECK_NOTNULL(src_anchor);
+      auto src_node = src_anchor->GetOwnerNode();
+      GE_CHECK_NOTNULL(src_node);
+      if (src_node->GetName() == netoutput_input_name && src_anchor->GetIdx() == parent_node_anchor_index) {
+        dest_in_anchor = in_anchor;
+        src_out_anchor = src_anchor;
+        GELOGD("TUU:get out node:%s 's in anchor(%d) src_node:%s 's out anchor(%d) related with data node:%s",
+               out_node->GetName().c_str(), dest_in_anchor->GetIdx(), netoutput_input_name.c_str(),
+               parent_node_anchor_index, data_node->GetName().c_str());
+        break;
+      }
+    }
+  }
+  GE_CHECK_NOTNULL(dest_in_anchor);
+  GE_CHECK_NOTNULL(src_out_anchor);
+  return SUCCESS;
+}
+
+}  // namespace ge
\ No newline at end of file
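Taken together, part 1 and part 2 form a dump/merge round trip: ConvertGraphToFile rewrites each partitioned subgraph's PLACEHOLDER/END boundary nodes into executable DATA/NETOUTPUT nodes and dumps one file per subgraph, and ConvertFileToGraph later loads those files and stitches them back into a single graph. A minimal driver sketch, assuming the subgraph vectors come from the graph partitioner; this is hypothetical illustration code, not part of the patch, and the dump paths are invented:

    // Hypothetical driver for the TuningUtils round trip (illustration only).
    void TuningRoundTripSketch(std::vector<ge::ComputeGraphPtr> &tuning_subgraphs,
                               std::vector<ge::ComputeGraphPtr> &non_tuning_subgraphs) {
      // Part 1: rewrite pld/end into data/netoutput and dump one file per subgraph.
      (void)ge::TuningUtils::ConvertGraphToFile(tuning_subgraphs, non_tuning_subgraphs,
                                                true /*exe_flag*/, "./tune_dump", "");
      // Part 2: after tuning, load the dumped subgraphs and merge them back together.
      std::map<int64_t, std::string> options{{0, "./tune_dump/aicore_subgraph_0.txt"}};
      ge::Graph merged("merged_after_tune");
      (void)ge::TuningUtils::ConvertFileToGraph(options, merged);
    }
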
diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt
index 922502e6..18c433cb 100755
--- a/src/ge/CMakeLists.txt
+++ b/src/ge/CMakeLists.txt
@@ -31,6 +31,7 @@ file(GLOB PROTO_HEADER_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../proto/ge_ir.proto"
     "../proto/fwk_adapter.proto"
     "../proto/op_mapping_info.proto"
+    "../proto/dump_task.proto"
 )
 ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})
 ge_protobuf_generate(ge PROTO_CLIENT_SRCS PROTO_CLIENT_HDRS ${PROTO_CLIENT_LIST})
@@ -39,6 +40,7 @@ ge_protobuf_generate(ge PROTO_HEADER_SRCS PROTO_HEADER_HDRS ${PROTO_HEADER_LIST}
 include_directories(${CMAKE_CURRENT_LIST_DIR})
 include_directories(${GE_SOURCE_DIR})
 include_directories(${GE_SOURCE_DIR}/src)
+include_directories(${GE_SOURCE_DIR}/src/ge/analyzer)
 include_directories(${GE_SOURCE_DIR}/inc)
 include_directories(${GE_SOURCE_DIR}/inc/common/util)
 include_directories(${GE_SOURCE_DIR}/inc/external)
@@ -55,6 +57,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge)
 ######### libge_runner.so #############
 # need to remove dependencies on pb files later
 file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
+    "analyzer/analyzer.cc"
     "client/ge_api.cc"
     "common/dump/dump_manager.cc"
     "common/dump/dump_properties.cc"
@@ -105,12 +108,12 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "graph/manager/graph_caching_allocator.cc"
     "graph/manager/graph_var_manager.cc"
     "graph/manager/model_manager/event_manager.cc"
+    "graph/manager/rdma_pool_allocator.cc"
     "graph/manager/trans_var_data_utils.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/hcom_util.cc"
     "graph/manager/util/rt_context_util.cc"
     "graph/manager/util/variable_accelerate_ctrl.cc"
-    "graph/manager/model_manager/event_manager.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/hcom_util.cc"
     "graph/manager/util/rt_context_util.cc"
@@ -228,6 +231,7 @@ target_link_libraries(ge_runner
 ######### libge_compiler.so #############
 # need to remove dependencies on pb files later
 file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
+    "analyzer/analyzer.cc"
     "common/dump/dump_properties.cc"
     "common/dump/dump_manager.cc"
     "common/dump/dump_op.cc"
@@ -276,6 +280,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "graph/manager/trans_var_data_utils.cc"
     "graph/manager/graph_var_manager.cc"
     "graph/manager/model_manager/event_manager.cc"
+    "graph/manager/rdma_pool_allocator.cc"
     "graph/manager/util/debug.cc"
     "graph/manager/util/rt_context_util.cc"
     "graph/manager/util/variable_accelerate_ctrl.cc"
"graph/manager/rdma_pool_allocator.cc" "graph/manager/util/debug.cc" "graph/manager/util/rt_context_util.cc" "graph/manager/util/variable_accelerate_ctrl.cc" diff --git a/src/ge/analyzer/analyzer.cc b/src/ge/analyzer/analyzer.cc new file mode 100644 index 00000000..1c944971 --- /dev/null +++ b/src/ge/analyzer/analyzer.cc @@ -0,0 +1,304 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "analyzer.h" + +#include +#include +#include + +#include "framework/common/debug/ge_log.h" +#include "framework/common/util.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/type_utils.h" + +namespace ge { +using json = nlohmann::json; +using Status = ge::Status; +using ComputeGraph = ge::ComputeGraph; +using namespace analyzer; + +namespace { +constexpr int kFileAuthority = 0640; +constexpr int kJsonDumpLevel = 4; + +const std::string kFilePath = "./"; +const std::string kAnalyzeFile = "ge_check_op.json"; + +const std::string kUnknownShape = "unknownshape"; +const std::string kUnsupport = "unsupport"; + +const std::string kSessionId = "session_id"; +const std::string kGraphId = "graph_id"; +const std::string kOpInfo = "op_info"; +const std::string kErrorType = "error_type"; +const std::string kOpName = "name"; +const std::string kOpType = "type"; +const std::string kReason = "reason"; +const std::string kInput = "input"; +const std::string kOutput = "output"; +const std::string kShape = "shape"; +const std::string kDataType = "data_type"; +const std::string kLayout = "layout"; +const std::string kResult = "result"; +const std::string kOp = "op"; + +std::map errors_map{{PARSER, "paser_error"}, + {INFER_SHAPE, "infer_shape_error"}, + {CHECKSUPPORT, "check_support_error"}, + {GRAPH_OPTIMIZE, "graph_optimize_error"}, + {GRAPH_PARTION, "graph_partion_error"}, + {GRAPH_BUILDER, "graph_builder_error"}}; +} // namespace + +Analyzer *Analyzer::GetInstance() { + static Analyzer instance; + return &instance; +} + +Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { + GELOGD("Start to build map. 
+
+ge::Status Analyzer::Initialize() {
+  ClearHistoryFile();
+  return CreateAnalyzerFile();
+}
+
+void Analyzer::Finalize() {
+  GELOGD("Analyzer start to finalize!");
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  for (auto &session_resource : graph_infos_) {
+    session_resource.second.clear();
+  }
+  graph_infos_.clear();
+
+  std::lock_guard<std::mutex> lk(file_mutex_);
+  if (json_file_.is_open()) {
+    json_file_.close();
+  }
+}
+
+void Analyzer::DestroySessionJsonObject(uint64_t session_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGW("can not find the stored object by session_id[%lu]. Do nothing", session_id);
+  } else {
+    graph_infos_.erase(iter);
+  }
+}
+
+void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGW("can not find the stored object by session_id[%lu]. Do nothing", session_id);
+  } else {
+    auto iter1 = (iter->second).find(graph_id);
+    if (iter1 == (iter->second).end()) {
+      GELOGW("can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing", session_id,
+             graph_id);
+      return;
+    }
+    (iter->second).erase(iter1);
+  }
+}
+
+std::shared_ptr<GraphInfo> Analyzer::GetJsonObject(uint64_t session_id, uint64_t graph_id) {
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto iter = graph_infos_.find(session_id);
+  if (iter == graph_infos_.end()) {
+    GELOGE(PARAM_INVALID, "session_id:%lu does not exist!", session_id);
+    return nullptr;
+  } else {
+    auto iter1 = (iter->second).find(graph_id);
+    if (iter1 == (iter->second).end()) {
+      GELOGE(PARAM_INVALID, "graph_id:%lu does not exist!", graph_id);
+      return nullptr;
+    }
+    GELOGI("GetJsonObject Success!session_id:%lu graph_id:%lu", session_id, graph_id);
+    return iter1->second;
+  }
+}
+
+void Analyzer::ClearHistoryFile() {
+  GELOGD("Analyzer start to clear history file!");
+
+  // Remove history files
+  int res = remove(json_file_name_.c_str());
+  GELOGD("remove file %s, result:%d", json_file_name_.c_str(), res);
+}
+
+ge::Status Analyzer::CreateAnalyzerFile() {
+  GELOGD("start to create analyzer file!");
+  // Create the analyzer file (truncate it if it already exists).
+  string real_path = RealPath(kFilePath.c_str());
+  if (real_path.empty()) {
+    GELOGE(FAILED, "File path is invalid.");
+    return FAILED;
+  }
+  string file = real_path + "/" + kAnalyzeFile;
+  GELOGD("Start to create analyzer file[%s]", file.c_str());
+  int fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
+  if (fd < 0) {
+    GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", file.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (close(fd) != 0) {
+    GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", file.c_str());
+    return INTERNAL_ERROR;
+  }
+  json_file_name_ = file;
+
+  GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str());
+  return SUCCESS;
+}
+
+ge::Status Analyzer::SaveAnalyzerDataToFile() {
+  GELOGD("start to save analyze file!");
+  std::lock_guard<std::mutex> lg(file_mutex_);
+  json_file_.open(json_file_name_, std::ios::out);
+  if (!json_file_.is_open()) {
+    GELOGE(FAILED, "analyzer file does not exist[%s]", json_file_name_.c_str());
+    return PARAM_INVALID;
+  }
+
+  std::lock_guard<std::recursive_mutex> lk(mutex_);
+  for (auto &ele : graph_infos_) {
+    for (auto &ele2 : ele.second) {
+      json jsn;
+      GraphInfoToJson(jsn, *(ele2.second));
+      json_file_ << jsn.dump(kJsonDumpLevel) << std::endl;
+    }
+  }
+
+  json_file_.close();
+  return SUCCESS;
+}
+
+ge::Status Analyzer::DoAnalyze(DataInfo &data_info) {
+  GELOGD("start to do analyzer!");
+
+  auto pnode = data_info.node_ptr;
+  GE_CHECK_NOTNULL(pnode);
+  auto desc = pnode->GetOpDesc();
+  GE_CHECK_NOTNULL(desc);
+  // buffer the analyzed data
+  std::lock_guard<std::recursive_mutex> lg(mutex_);
+  auto graph_info = GetJsonObject(data_info.session_id, data_info.graph_id);
+  GE_CHECK_NOTNULL(graph_info);
+  auto status = SaveOpInfo(desc, data_info, graph_info);
+  if (status != SUCCESS) {
+    GELOGE(status, "save op info failed!");
+    return FAILED;
+  }
+  // save data to file
+  return SaveAnalyzerDataToFile();
+}
+
+ge::Status Analyzer::SaveOpInfo(ge::OpDescPtr desc, DataInfo &data_info,
+                                std::shared_ptr<GraphInfo> graph_info) {
+  auto iter = errors_map.find(data_info.analyze_type);
+  if (iter == errors_map.end()) {
+    return PARAM_INVALID;
+  }
+  OpInfo op_info;
+  op_info.error_type = iter->second;
+  op_info.op_name = desc->GetName();
+  op_info.op_type = desc->GetType();
+  op_info.reason = data_info.reason;
+
+  for (const auto &ptr : desc->GetAllInputsDescPtr()) {
+    TensorInfo tensor_info;
+    tensor_info.shape = ptr->GetShape().GetDims();
+    tensor_info.d_type = ge::TypeUtils::DataTypeToSerialString(ptr->GetDataType());
+    tensor_info.layout = ge::TypeUtils::FormatToSerialString(ptr->GetFormat());
+    op_info.input_info.emplace_back(tensor_info);
+  }
+  for (const auto &ptr : desc->GetAllOutputsDescPtr()) {
+    TensorInfo tensor_info;
+    tensor_info.shape = ptr->GetShape().GetDims();
+    tensor_info.d_type = ge::TypeUtils::DataTypeToSerialString(ptr->GetDataType());
+    tensor_info.layout = ge::TypeUtils::FormatToSerialString(ptr->GetFormat());
+    op_info.output_info.emplace_back(tensor_info);
+  }
+  graph_info->op_info.emplace_back(op_info);
+
+  return SUCCESS;
+}
+
+void Analyzer::TensorInfoToJson(json &j, const TensorInfo &tensor_info) {
+  j[kShape] = tensor_info.shape;
+  j[kDataType] = tensor_info.d_type;
+  j[kLayout] = tensor_info.layout;
+}
+
+void Analyzer::OpInfoToJson(json &j, const OpInfo &op_info) {
+  j[kErrorType] = op_info.error_type;
+  j[kOpName] = op_info.op_name;
+  j[kOpType] = op_info.op_type;
+  j[kReason] = op_info.reason;
+  for (size_t i = 0; i < op_info.input_info.size(); i++) {
+    json json_tensor_info;
+    TensorInfoToJson(json_tensor_info, op_info.input_info.at(i));
+    j[kInput + std::to_string(i)] = json_tensor_info;
+  }
+  for (size_t i = 0; i < op_info.output_info.size(); i++) {
+    json json_tensor_info;
+    TensorInfoToJson(json_tensor_info, op_info.output_info.at(i));
+    j[kOutput + std::to_string(i)] = json_tensor_info;
+  }
+}
+
+void Analyzer::GraphInfoToJson(json &j, const GraphInfo &graph_info) {
+  GELOGD("start to buffer graph info!");
+  j[kSessionId] = graph_info.session_id;
+  j[kGraphId] = graph_info.graph_id;
+  std::vector<json> json_op_infos;
+  for (size_t i = 0; i < graph_info.op_info.size(); i++) {
+    json json_op_info;
+    OpInfoToJson(json_op_info, graph_info.op_info.at(i));
+    json_op_infos.emplace_back(json_op_info);
+  }
+  j[kOp] = json_op_infos;
+}
+}  // namespace ge
diff --git a/src/ge/analyzer/analyzer.h b/src/ge/analyzer/analyzer.h
new file mode 100644
index 00000000..4ac8b391
--- /dev/null
+++ b/src/ge/analyzer/analyzer.h
@@ -0,0 +1,188 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DOMI_ANALYZER_ANALYZER_H_
+#define DOMI_ANALYZER_ANALYZER_H_
+
+#include "nlohmann/json.hpp"
+
+#include <cstdlib>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "external/ge/ge_api_types.h"
+#include "graph/compute_graph.h"
+#include "graph/node.h"
+
+namespace ge {
+namespace analyzer {
+enum AnalyzeType {
+  PARSER = 0,
+  INFER_SHAPE = 1,
+  CHECKSUPPORT = 2,
+  GRAPH_OPTIMIZE = 3,
+  GRAPH_PARTION = 4,
+  GRAPH_BUILDER = 5,
+};
+
+struct TensorInfo {
+  std::vector<int64_t> shape;
+  std::string d_type;
+  std::string layout;
+};
+
+struct OpInfo {
+  std::string error_type;
+  std::string op_name;
+  std::string op_type;
+  std::vector<TensorInfo> input_info;
+  std::vector<TensorInfo> output_info;
+  std::string reason;
+};
+
+struct GraphInfo {
+  uint64_t session_id = 0;
+  uint64_t graph_id = 0;
+  std::vector<OpInfo> op_info;
+};
+
+struct DataInfo {
+  DataInfo() = default;
+  ~DataInfo() = default;
+
+  DataInfo(uint64_t sess, uint64_t graph, AnalyzeType type, ge::NodePtr node, std::string error_info) {
+    session_id = sess;
+    graph_id = graph;
+    analyze_type = type;
+    node_ptr = node;
+    reason = error_info;
+  }
+  uint64_t session_id;
+  uint64_t graph_id;
+  AnalyzeType analyze_type;
+  ge::NodePtr node_ptr{nullptr};
+  std::string reason;
+};
+}  // namespace analyzer
+
+class Analyzer {
+ public:
+  /**
+   * @ingroup ge
+   * @brief: get analyzer instance.
+   * @param [in]: None
+   * @return: Analyzer instance ptr
+   */
+  static Analyzer *GetInstance();
+
+  /**
+   * @ingroup ge
+   * @brief: check whether the env var ENABLE_NETWORK_ANALYSIS_DEBUG is set.
+   * When the env is enabled, the adaptor keeps the sink geop graph even if it fails.
+   * @param [in]: None
+   * @return: true: env enabled  false: env disabled
+   */
+  bool IsEnableNetAnalyzeDebug() { return std::getenv("ENABLE_NETWORK_ANALYSIS_DEBUG") != nullptr; }
+
+  /**
+   * @ingroup ge
+   * @brief: build the buffered json object for the given session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: 0: success  other: failed
+   */
+  ge::Status BuildJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: get the buffered json object by session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: nullptr if failed
+   */
+  std::shared_ptr<analyzer::GraphInfo> GetJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: analyzer global init method.
+   * @param [in]: None
+   * @return: Status
+   */
+  ge::Status Initialize();
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Release all resources used by the analyzer.
+   * @param [in]: None
+   * @return: None
+   */
+  void Finalize();
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Only release resources of the given session id.
+   * @param [in]: session id
+   * @return: None
+   */
+  void DestroySessionJsonObject(uint64_t session_id);
+
+  /**
+   * @ingroup ge
+   * @brief: destruct method. Only release resources of the given session id and graph id.
+   * @param [in]: session id & graph id
+   * @return: None
+   */
+  void DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id);
+
+  /**
+   * @ingroup ge
+   * @brief: main process method. Buffer analyzed data and output it to the json file
+   * @param [in]: DataInfo Object
+   * @return: 0: SUCCESS  other: FAILED
+   */
+  ge::Status DoAnalyze(analyzer::DataInfo &data_info);
+
+  Analyzer(const Analyzer &) = delete;
+  Analyzer &operator=(const Analyzer &) = delete;
+  Analyzer(Analyzer &&) = delete;
+  Analyzer &operator=(Analyzer &&) = delete;
+
+ private:
+  void TensorInfoToJson(nlohmann::json &j, const analyzer::TensorInfo &tensor_info);
+  void OpInfoToJson(nlohmann::json &j, const analyzer::OpInfo &op_info);
+  void GraphInfoToJson(nlohmann::json &j, const analyzer::GraphInfo &graph_info);
+
+  ge::Status SaveAnalyzerDataToFile();
+  ge::Status SaveOpInfo(ge::OpDescPtr desc, analyzer::DataInfo &data_info,
+                        std::shared_ptr<analyzer::GraphInfo> graph_info);
+
+  void ClearHistoryFile();
+  ge::Status CreateAnalyzerFile();
+
+  Analyzer() = default;
+  ~Analyzer() = default;
+
+ private:
+  std::map<uint64_t, std::map<uint64_t, std::shared_ptr<analyzer::GraphInfo>>> graph_infos_;
+  std::recursive_mutex mutex_;  // protect graph_infos_
+  std::mutex file_mutex_;       // protect json_file_
+  std::ofstream json_file_;
+  std::string json_file_name_;
+};
+}  // namespace ge
+#endif  // DOMI_ANALYZER_ANALYZER_H_
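The analyzer is a process-wide singleton keyed by session and graph id: a caller builds the json object once, then reports failing ops through DoAnalyze, which rewrites ./ge_check_op.json on every call. A usage sketch with a hypothetical caller and an invented reason string (illustration only, not part of this patch; the commented json shape follows the *ToJson helpers above):

    // Hypothetical call site: record a check-support failure for `node`.
    ge::Status ReportUnsupportedOp(uint64_t session_id, uint64_t graph_id, const ge::NodePtr &node) {
      auto *analyzer = ge::Analyzer::GetInstance();
      if (analyzer->BuildJsonObject(session_id, graph_id) != ge::SUCCESS) {
        return ge::FAILED;
      }
      ge::analyzer::DataInfo info(session_id, graph_id, ge::analyzer::CHECKSUPPORT, node,
                                  "op is not supported by any engine");
      // DoAnalyze buffers the op info and rewrites ge_check_op.json, roughly:
      //   {"session_id":..., "graph_id":..., "op":[{"error_type":"check_support_error",
      //     "name":..., "type":..., "reason":..., "input0":{"shape":...,"data_type":...,"layout":...}}]}
      return analyzer->DoAnalyze(info);
    }
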
diff --git a/src/ge/client/ge_api.cc b/src/ge/client/ge_api.cc
index 9eb15ee4..0458a508 100644
--- a/src/ge/client/ge_api.cc
+++ b/src/ge/client/ge_api.cc
@@ -32,7 +32,6 @@
 #include "register/op_registry.h"
 #include "common/ge/tbe_plugin_manager.h"
 
-using domi::GetContext;
 using domi::OpRegistry;
 using std::map;
 using std::string;
diff --git a/src/ge/common/CMakeLists.txt b/src/ge/common/CMakeLists.txt
index adcdb1bc..58ba9bac 100755
--- a/src/ge/common/CMakeLists.txt
+++ b/src/ge/common/CMakeLists.txt
@@ -25,6 +25,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../model/ge_model.cc"
     "auth/file_saver.cc"
     "context/ctx.cc"
+    "cust_aicpu_kernel_store.cc"
     "debug/memory_dumper.cc"
     "fmk_error_codes.cc"
     "formats/format_transfers/datatype_transfer.cc"
@@ -52,6 +53,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "ge_format_util.cc"
     "helper/model_helper.cc"
    "helper/om_file_helper.cc"
+    "kernel_store.cc"
     "math/fp16_math.cc"
     "model_parser/base.cc"
     "model_saver.cc"
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_BASE64_H_ +#define GE_COMMON_BASE64_H_ + +#include +#include + +#include "debug/ge_log.h" +#include "ge_error_codes.h" + +namespace ge { +namespace { +const char *kBase64Chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; +const char kEqualSymbol = '='; +const size_t kBase64CharsNum = 64; +const size_t kThreeByteOneGroup = 3; +const size_t kFourByteOneGroup = 4; +} // namespace + +namespace base64 { +static inline bool IsBase64Char(const char &c) { return (isalnum(c) || (c == '+') || (c == '/')); } + +static std::string EncodeToBase64(const std::string &raw_data) { + size_t encode_length = raw_data.size() / kThreeByteOneGroup * kFourByteOneGroup; + encode_length += raw_data.size() % kThreeByteOneGroup == 0 ? 0 : kFourByteOneGroup; + size_t raw_data_index = 0; + size_t encode_data_index = 0; + std::string encode_data; + encode_data.resize(encode_length); + + for (; raw_data_index + kThreeByteOneGroup <= raw_data.size(); raw_data_index += kThreeByteOneGroup) { + auto char_1 = static_cast(raw_data[raw_data_index]); + auto char_2 = static_cast(raw_data[raw_data_index + 1]); + auto char_3 = static_cast(raw_data[raw_data_index + 2]); + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)]; + encode_data[encode_data_index++] = kBase64Chars[((char_2 << 2u) & 0x3c) | (char_3 >> 6u)]; + encode_data[encode_data_index++] = kBase64Chars[char_3 & 0x3f]; + } + + if (raw_data_index < raw_data.size()) { + auto tail = raw_data.size() - raw_data_index; + auto char_1 = static_cast(raw_data[raw_data_index]); + if (tail == 1) { + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[(char_1 << 4u) & 0x30]; + encode_data[encode_data_index++] = kEqualSymbol; + encode_data[encode_data_index++] = kEqualSymbol; + } else { + auto char_2 = static_cast(raw_data[raw_data_index + 1]); + encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; + encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)]; + encode_data[encode_data_index++] = kBase64Chars[(char_2 << 2u) & 0x3c]; + encode_data[encode_data_index++] = kEqualSymbol; + } + } + return encode_data; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" +static Status DecodeFromBase64(const std::string &base64_data, std::string &decode_data) { + if (base64_data.size() % kFourByteOneGroup != 0) { + GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu", base64_data.size()); + return PARAM_INVALID; + } + decode_data.clear(); + size_t base64_data_len = base64_data.size(); + uint8_t byte_4[kFourByteOneGroup]; + auto FindCharInBase64Chars = [&](const char &raw_char) -> uint8_t { + auto char_pos = std::find(kBase64Chars, kBase64Chars + kBase64CharsNum, raw_char); + return static_cast(std::distance(kBase64Chars, char_pos)) & 0xff; + }; + + for 
diff --git a/src/ge/common/cust_aicpu_kernel_store.cc b/src/ge/common/cust_aicpu_kernel_store.cc
new file mode 100644
index 00000000..46eb484b
--- /dev/null
+++ b/src/ge/common/cust_aicpu_kernel_store.cc
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/cust_aicpu_kernel_store.h"
+
+namespace ge {
+
+CustAICPUKernelStore::CustAICPUKernelStore() {}
+
+void CustAICPUKernelStore::AddCustAICPUKernel(const CustAICPUKernelPtr &kernel) { AddKernel(kernel); }
+
+void CustAICPUKernelStore::LoadCustAICPUKernelBinToOpDesc(const std::shared_ptr<OpDesc> &op_desc) const {
+  GELOGI("LoadCustAICPUKernelBinToOpDesc in");
+  if (op_desc != nullptr) {
+    auto kernel_bin = FindKernel(op_desc->GetName());
+    if (kernel_bin != nullptr) {
+      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_CUSTAICPU_KERNEL, kernel_bin),
+                      GELOGW("LoadCustAICPUKernelBinToOpDesc: SetExtAttr for kernel_bin failed");)
+      GELOGI("Load cust aicpu kernel:%s, %zu", kernel_bin->GetName().c_str(), kernel_bin->GetBinDataSize());
+    }
+  }
+  GELOGI("LoadCustAICPUKernelBinToOpDesc success");
+}
+}  // namespace ge
diff --git a/src/ge/common/cust_aicpu_kernel_store.h b/src/ge/common/cust_aicpu_kernel_store.h
new file mode 100644
index 00000000..6dff0435
--- /dev/null
+++ b/src/ge/common/cust_aicpu_kernel_store.h
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
+#define GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
+
+#include "common/kernel_store.h"
+
+namespace ge {
+
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY CustAICPUKernelStore : public KernelStore {
+ public:
+  CustAICPUKernelStore();
+  ~CustAICPUKernelStore() {}
+
+  void AddCustAICPUKernel(const CustAICPUKernelPtr &kernel);
+
+  void LoadCustAICPUKernelBinToOpDesc(const std::shared_ptr<OpDesc> &op_desc) const;
+};
+}  // namespace ge
+
+#endif  // GE_COMMON_CUST_AICPU_KERNEL_STORE_H_
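The store mirrors the existing TBE kernel store: kernels registered at build time are serialized into an om partition and re-attached to the op at load time. A save/restore sketch, assuming Build() and Load() are public members of the KernelStore base class added later in this patch (hypothetical flow; names are invented for illustration):

    // Hypothetical save/restore flow for CustAICPUKernelStore.
    void CustKernelStoreSketch(const ge::CustAICPUKernelPtr &kernel, const ge::OpDescPtr &op_desc) {
      ge::CustAICPUKernelStore save_store;
      save_store.AddCustAICPUKernel(kernel);  // registered during op compilation
      (void)save_store.Build();               // pack kernels into the flat buffer
      // save_store.Data()/DataSize() travel as the CUST_AICPU_KERNELS om partition.
      ge::CustAICPUKernelStore load_store;
      (void)load_store.Load(save_store.Data(), save_store.DataSize());
      load_store.LoadCustAICPUKernelBinToOpDesc(op_desc);  // sets OP_EXTATTR_CUSTAICPU_KERNEL
    }
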
diff --git a/src/ge/common/debug/memory_dumper.cc b/src/ge/common/debug/memory_dumper.cc
index 56724be8..1a7d9db8 100644
--- a/src/ge/common/debug/memory_dumper.cc
+++ b/src/ge/common/debug/memory_dumper.cc
@@ -157,7 +157,7 @@ int MemoryDumper::OpenFile(const char *filename) {
 
   // Using the O_EXCL, if the file already exists, return failed to avoid privilege escalation vulnerability.
   mode_t mode = S_IRUSR | S_IWUSR;
-  int32_t fd = mmOpen2(real_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
+  int32_t fd = mmOpen2(real_path.c_str(), O_RDWR | O_CREAT | O_APPEND, mode);
   if (fd == EN_ERROR || fd == EN_INVALID_PARAM) {
     GELOGE(kInvalidFd, "open file failed. errno = %d, %s", fd, strerror(errno));
     return kInvalidFd;
diff --git a/src/ge/common/formats/format_transfers/datatype_transfer.cc b/src/ge/common/formats/format_transfers/datatype_transfer.cc
index 08c6889f..a603b2f4 100644
--- a/src/ge/common/formats/format_transfers/datatype_transfer.cc
+++ b/src/ge/common/formats/format_transfers/datatype_transfer.cc
@@ -44,6 +44,9 @@ enum DataTypeTransMode {
   kTransferWithDatatypeInt8ToFloat,
   kTransferWithDatatypeInt8ToInt32,
   kTransferWithDatatypeInt64ToInt32,
+  kTransferWithDatatypeInt32ToInt64,
+  kTransferWithDatatypeInt32ToDouble,
+  kTransferWithDatatypeDoubleToInt32,
 };
 
 std::map<std::pair<DataType, DataType>, DataTypeTransMode> trans_mode_map{
@@ -59,7 +62,11 @@ std::map<std::pair<DataType, DataType>, DataTypeTransMode> trans_mode_map{
   {std::pair<DataType, DataType>(DT_UINT8, DT_INT32), kTransferWithDatatypeUint8ToInt32},
   {std::pair<DataType, DataType>(DT_INT8, DT_FLOAT), kTransferWithDatatypeInt8ToFloat},
   {std::pair<DataType, DataType>(DT_INT8, DT_INT32), kTransferWithDatatypeInt8ToInt32},
-  {std::pair<DataType, DataType>(DT_INT64, DT_INT32), kTransferWithDatatypeInt64ToInt32}};
+  {std::pair<DataType, DataType>(DT_INT64, DT_INT32), kTransferWithDatatypeInt64ToInt32},
+  {std::pair<DataType, DataType>(DT_INT32, DT_INT64), kTransferWithDatatypeInt32ToInt64},
+  {std::pair<DataType, DataType>(DT_INT32, DT_DOUBLE), kTransferWithDatatypeInt32ToDouble},
+  {std::pair<DataType, DataType>(DT_DOUBLE, DT_INT32), kTransferWithDatatypeDoubleToInt32},
+};
 
 template <typename SrcT, typename DstT>
 Status TransDataSrc2Dst(const CastArgs &args, uint8_t *dst, const size_t data_size) {
@@ -82,38 +89,30 @@ Status TransDataSrc2Fp16(const CastArgs &args, uint8_t *dst, const size_t data_size) {
 }
 
 Status CastKernel(const CastArgs &args, uint8_t *dst, const size_t data_size, const DataTypeTransMode trans_mode) {
-  switch (trans_mode) {
-    case kTransferWithDatatypeFloatToFloat16:
-      return TransDataSrc2Fp16<float>(args, dst, data_size);
-    case kTransferWithDatatypeFloatToInt32:
-      return TransDataSrc2Dst<float, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeFloat16ToFloat:
-      return TransDataSrc2Dst<fp16_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeFloat16ToInt32:
-      return TransDataSrc2Dst<fp16_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToFloat:
-      return TransDataSrc2Dst<int32_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToFloat16:
-      return TransDataSrc2Fp16<int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToUint8:
-      return TransDataSrc2Dst<int32_t, uint8_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt32ToInt8:
-      return TransDataSrc2Dst<int32_t, int8_t>(args, dst, data_size);
-    case kTransferWithDatatypeUint8ToFloat:
-      return TransDataSrc2Dst<uint8_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeUint8ToInt32:
-      return TransDataSrc2Dst<uint8_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt8ToFloat:
-      return TransDataSrc2Dst<int8_t, float>(args, dst, data_size);
-    case kTransferWithDatatypeInt8ToInt32:
-      return TransDataSrc2Dst<int8_t, int32_t>(args, dst, data_size);
-    case kTransferWithDatatypeInt64ToInt32:
-      return TransDataSrc2Dst<int64_t, int32_t>(args, dst, data_size);
-    default:
-      GELOGE(PARAM_INVALID, "Trans data type from %s to %s is not supported.",
-             TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
-             TypeUtils::DataTypeToSerialString(args.dst_data_type).c_str());
-      return UNSUPPORTED;
+  static std::map<DataTypeTransMode, std::function<Status(const CastArgs &, uint8_t *, const size_t)>>
+    transfer_handle = {
+      {kTransferWithDatatypeFloatToFloat16, TransDataSrc2Fp16<float>},
+      {kTransferWithDatatypeFloatToInt32, TransDataSrc2Dst<float, int32_t>},
+      {kTransferWithDatatypeFloat16ToFloat, TransDataSrc2Dst<fp16_t, float>},
+      {kTransferWithDatatypeFloat16ToInt32, TransDataSrc2Dst<fp16_t, int32_t>},
+      {kTransferWithDatatypeInt32ToFloat, TransDataSrc2Dst<int32_t, float>},
+      {kTransferWithDatatypeInt32ToFloat16, TransDataSrc2Fp16<int32_t>},
+      {kTransferWithDatatypeInt32ToUint8, TransDataSrc2Dst<int32_t, uint8_t>},
+      {kTransferWithDatatypeInt32ToInt8, TransDataSrc2Dst<int32_t, int8_t>},
+      {kTransferWithDatatypeUint8ToFloat, TransDataSrc2Dst<uint8_t, float>},
+      {kTransferWithDatatypeUint8ToInt32, TransDataSrc2Dst<uint8_t, int32_t>},
+      {kTransferWithDatatypeInt8ToFloat, TransDataSrc2Dst<int8_t, float>},
+      {kTransferWithDatatypeInt8ToInt32, TransDataSrc2Dst<int8_t, int32_t>},
+      {kTransferWithDatatypeInt64ToInt32, TransDataSrc2Dst<int64_t, int32_t>},
+      {kTransferWithDatatypeInt32ToInt64, TransDataSrc2Dst<int32_t, int64_t>},
+      {kTransferWithDatatypeInt32ToDouble, TransDataSrc2Dst<int32_t, double>},
+      {kTransferWithDatatypeDoubleToInt32, TransDataSrc2Dst<double, int32_t>},
+    };
+  auto it = transfer_handle.find(trans_mode);
+  if (it == transfer_handle.end()) {
+    return UNSUPPORTED;
+  } else {
+    return (it->second)(args, dst, data_size);
   }
 }
 }  // namespace
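The rewrite above replaces a long switch with a static handler table, so adding a conversion pair (such as the new int32/double modes) becomes a one-line change. The shape of that pattern, reduced to a self-contained toy example (illustration only, not GE code):

    #include <functional>
    #include <iostream>
    #include <map>

    enum class Mode { kAddOne, kDouble };

    int Handle(Mode mode, int value) {
      // One entry per mode; unknown modes fall through to an error value,
      // mirroring the UNSUPPORTED branch in CastKernel above.
      static const std::map<Mode, std::function<int(int)>> handlers = {
        {Mode::kAddOne, [](int v) { return v + 1; }},
        {Mode::kDouble, [](int v) { return v * 2; }},
      };
      auto it = handlers.find(mode);
      return it == handlers.end() ? -1 : it->second(value);
    }

    int main() { std::cout << Handle(Mode::kDouble, 21) << '\n'; }  // prints 42
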
diff --git a/src/ge/common/ge_common.mk b/src/ge/common/ge_common.mk
index e913c8f5..7632b46d 100644
--- a/src/ge/common/ge_common.mk
+++ b/src/ge/common/ge_common.mk
@@ -36,7 +36,9 @@ GE_COMMON_LOCAL_SRC_FILES := \
     properties_manager.cc \
     types.cc\
     model_parser/base.cc \
+    kernel_store.cc \
     tbe_kernel_store.cc \
+    cust_aicpu_kernel_store.cc \
     op/attr_value_util.cc \
     op/ge_op_utils.cc \
     thread_pool.cc \
diff --git a/src/ge/common/helper/model_cache_helper.cc b/src/ge/common/helper/model_cache_helper.cc
index e9b1de83..d3b4dde5 100644
--- a/src/ge/common/helper/model_cache_helper.cc
+++ b/src/ge/common/helper/model_cache_helper.cc
@@ -310,7 +310,7 @@ Status ModelCacheHelper::GetNodesNeedRecompile(ComputeGraphPtr &graph, vector
     auto kernel_lib_name = op_desc->GetOpKernelLibName();
     if (kernel_lib_name.empty()) {
       // reset op kernel lib
-      (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc);
+      (void)instance->DNNEngineManagerObj().GetDNNEngineName(node);
       kernel_lib_name = op_desc->GetOpKernelLibName();
       if (kernel_lib_name.empty()) {
         GELOGW("Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str());
diff --git a/src/ge/common/helper/model_helper.cc b/src/ge/common/helper/model_helper.cc
index 19614566..d860f7ba 100644
--- a/src/ge/common/helper/model_helper.cc
+++ b/src/ge/common/helper/model_helper.cc
@@ -41,6 +41,7 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil
                                        const uint8_t *data, size_t size) {
   if (size < 1 || size > UINT32_MAX) {
     GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size);
+    ErrorManager::GetInstance().ATCReportErrMessage("E19022");
     return PARAM_INVALID;
   }
   if (data == nullptr) {
@@ -101,16 +102,22 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore();
   GELOGI("TBE_KERNELS size is %zu", tbe_kernel_store.DataSize());
   if (tbe_kernel_store.DataSize() > 0) {
-    if (SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS, tbe_kernel_store.Data(),
-                           tbe_kernel_store.DataSize()) != SUCCESS) {
-      GELOGE(PARAM_INVALID, "Add tbe kernel partition failed");
-      return PARAM_INVALID;
-    }
+    GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS, tbe_kernel_store.Data(),
+                                         tbe_kernel_store.DataSize()),
+                      "Add tbe kernel partition failed");
   }
   // no need to check value, DATA->NetOutput
   (void)tbe_kernel_store.Load(tbe_kernel_store.Data(), tbe_kernel_store.DataSize());
 
+  CustAICPUKernelStore cust_aicpu_kernel_store = ge_model->GetCustAICPUKernelStore();
+  GELOGI("cust aicpu kernels size is %zu", cust_aicpu_kernel_store.DataSize());
+  if (cust_aicpu_kernel_store.DataSize() > 0) {
+    GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::CUST_AICPU_KERNELS,
+                                         cust_aicpu_kernel_store.Data(), cust_aicpu_kernel_store.DataSize()),
+                      "Add cust aicpu kernel partition failed");
+  }
+
   std::shared_ptr<ModelTaskDef> model_task_def = ge_model->GetModelTaskDefPtr();
   if (model_task_def == nullptr) {
     GELOGE(MEMALLOC_FAILED, "Create model task def ptr failed");
@@ -308,6 +315,10 @@ Status ModelHelper::GenerateGeModel(OmFileLoadHelper &om_load_helper) {
   if (ret != SUCCESS) {
     return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
   }
+  ret = LoadCustAICPUKernelStore(om_load_helper);
+  if (ret != SUCCESS) {
+    return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
+  }
   return SUCCESS;
 }
 
@@ -384,6 +395,22 @@ Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) {
   return SUCCESS;
 }
 
+Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) {
+  // Load cust aicpu kernels
+  ModelPartition partition_kernel_def;
+  CustAICPUKernelStore kernel_store;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def) == SUCCESS) {
+    GELOGI("Kernels partition size:%u", partition_kernel_def.size);
+    if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) {
+      GELOGI("Load cust aicpu kernels success");
+    } else {
+      GELOGW("Load cust aicpu kernels failed");
+    }
+  }
+  model_->SetCustAICPUKernelStore(kernel_store);
+  return SUCCESS;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeModel() {
   if (model_ != nullptr) {
     return model_;
diff --git a/src/ge/common/helper/om_file_helper.cc b/src/ge/common/helper/om_file_helper.cc
index f25e2af3..ca506731 100644
--- a/src/ge/common/helper/om_file_helper.cc
+++ b/src/ge/common/helper/om_file_helper.cc
@@ -27,6 +27,9 @@
 
 using std::string;
 
+namespace {
+const int32_t kOptionalNum = 2;
+}
 namespace ge {
 // For Load
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(const ge::ModelData &model) {
@@ -67,7 +70,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetMod
   }
 
   if (!found) {
-    if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA) {
+    if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA &&
+        type != ModelPartitionType::CUST_AICPU_KERNELS) {
       GELOGE(FAILED, "GetModelPartition:type:%d is not in partition_datas!", static_cast<int>(type));
       return FAILED;
     }
@@ -114,7 +118,7 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint
   // Davinci model partition include graph-info weight-info task-info tbe-kernel :
   // Original model partition include graph-info
   if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != (PARTITION_SIZE - 1)) &&
-      (partition_table->num != 1)) {
+      (partition_table->num != (PARTITION_SIZE - kOptionalNum)) && (partition_table->num != 1)) {
     GELOGE(GE_EXEC_MODEL_PARTITION_NUM_INVALID, "Invalid partition_table->num:%u", partition_table->num);
     return GE_EXEC_MODEL_PARTITION_NUM_INVALID;
   }
partition_datas!", static_cast(type)); return FAILED; } @@ -114,7 +118,7 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint // Davinici model partition include graph-info weight-info task-info tbe-kernel : // Original model partition include graph-info if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != (PARTITION_SIZE - 1)) && - (partition_table->num != 1)) { + (partition_table->num != (PARTITION_SIZE - kOptionalNum)) && (partition_table->num != 1)) { GELOGE(GE_EXEC_MODEL_PARTITION_NUM_INVALID, "Invalid partition_table->num:%u", partition_table->num); return GE_EXEC_MODEL_PARTITION_NUM_INVALID; } diff --git a/src/ge/common/kernel_store.cc b/src/ge/common/kernel_store.cc new file mode 100644 index 00000000..e465d184 --- /dev/null +++ b/src/ge/common/kernel_store.cc @@ -0,0 +1,118 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/kernel_store.h" + +namespace ge { + +void KernelStore::AddKernel(const KernelBinPtr &kernel) { + if (kernel != nullptr) { + kernels_[kernel->GetName()] = kernel; + } +} + +bool KernelStore::Build() { + buffer_.clear(); + size_t total_len = 0; + for (const auto &item : kernels_) { + auto kernel = item.second; + total_len += sizeof(KernelStoreItemHead); + total_len += kernel->GetName().length(); + total_len += kernel->GetBinDataSize(); + } + + try { + buffer_.resize(total_len); + } catch (std::bad_alloc &e) { + GELOGE(ge::MEMALLOC_FAILED, "All build memory failed, memory size %zu", total_len); + return false; + } + + uint8_t *next_buffer = buffer_.data(); + size_t remain_len = total_len; + errno_t mem_ret; + for (const auto &item : kernels_) { + auto kernel = item.second; + KernelStoreItemHead kernel_head{}; + kernel_head.magic = kKernelItemMagic; + kernel_head.name_len = static_cast(kernel->GetName().length()); + kernel_head.bin_len = static_cast(kernel->GetBinDataSize()); + + GELOGI("get kernel bin name %s, addr %p, size %u", kernel->GetName().c_str(), kernel->GetBinData(), + kernel->GetBinDataSize()); + mem_ret = memcpy_s(next_buffer, remain_len, &kernel_head, sizeof(kernel_head)); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + next_buffer += sizeof(kernel_head); + + mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head), kernel->GetName().data(), kernel_head.name_len); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + next_buffer += kernel_head.name_len; + + mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head) - kernel_head.name_len, kernel->GetBinData(), + kernel_head.bin_len); + GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); + + next_buffer += kernel_head.bin_len; + remain_len = remain_len - sizeof(kernel_head) - kernel_head.name_len - kernel_head.bin_len; + } + kernels_.clear(); + return true; +} + +const uint8_t *KernelStore::Data() const { return buffer_.data(); } + +size_t KernelStore::DataSize() const { return buffer_.size(); } + +bool KernelStore::Load(const 
uint8_t *data, const size_t &len) { + if (data == nullptr || len == 0) { + return false; + } + size_t buffer_len = len; + while (buffer_len > sizeof(KernelStoreItemHead)) { + const char *next_buffer = reinterpret_cast(data) + (len - buffer_len); + + const auto *kernel_head = reinterpret_cast(next_buffer); + if (buffer_len < kernel_head->name_len + kernel_head->bin_len + sizeof(KernelStoreItemHead)) { + GELOGW("Invalid kernel block remain buffer len %zu, name len %u, bin len %u", buffer_len, kernel_head->name_len, + kernel_head->bin_len); + break; + } + + next_buffer += sizeof(KernelStoreItemHead); + std::string name(next_buffer, kernel_head->name_len); + + next_buffer += kernel_head->name_len; + GELOGI("Load kernel from om:%s,%u,%u", name.c_str(), kernel_head->name_len, kernel_head->bin_len); + std::vector kernel_bin(next_buffer, next_buffer + kernel_head->bin_len); + KernelBinPtr teb_kernel_ptr = ge::MakeShared(name, std::move(kernel_bin)); + if (teb_kernel_ptr != nullptr) { + kernels_.emplace(name, teb_kernel_ptr); + } + buffer_len -= sizeof(KernelStoreItemHead) + kernel_head->name_len + kernel_head->bin_len; + } + + return true; +} + +KernelBinPtr KernelStore::FindKernel(const std::string &name) const { + auto it = kernels_.find(name); + if (it != kernels_.end()) { + return it->second; + } + return nullptr; +} + +} // namespace ge diff --git a/src/ge/common/kernel_store.h b/src/ge/common/kernel_store.h new file mode 100644 index 00000000..d73f26c5 --- /dev/null +++ b/src/ge/common/kernel_store.h @@ -0,0 +1,70 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_KERNEL_STORE_H_ +#define GE_COMMON_KERNEL_STORE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "common/ge/ge_util.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" +#include "framework/common/fmk_types.h" +#include "graph/op_desc.h" +#include "graph/op_kernel_bin.h" + +namespace ge { +using KernelBin = ge::OpKernelBin; +using KernelBinPtr = std::shared_ptr; +using CustAICPUKernel = ge::OpKernelBin; +using CustAICPUKernelPtr = std::shared_ptr; +using TBEKernel = ge::OpKernelBin; +using TBEKernelPtr = std::shared_ptr; + +const uint32_t kKernelItemMagic = 0x5d776efd; + +struct KernelStoreItemHead { + uint32_t magic; + uint32_t name_len; + uint32_t bin_len; +}; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY KernelStore { + public: + KernelStore() = default; + virtual ~KernelStore() = default; + virtual bool Build(); + + virtual bool Load(const uint8_t *data, const size_t &len); + + virtual const uint8_t *Data() const; + virtual size_t DataSize() const; + virtual void AddKernel(const KernelBinPtr &kernel); + virtual KernelBinPtr FindKernel(const std::string &name) const; + + private: + std::unordered_map kernels_; + std::vector buffer_; +}; +} // namespace ge + +#endif // GE_COMMON_KERNEL_STORE_H_ diff --git a/src/ge/common/math/math_util.h b/src/ge/common/math/math_util.h index 86c62209..e5a53d16 100644 --- a/src/ge/common/math/math_util.h +++ b/src/ge/common/math/math_util.h @@ -612,295 +612,268 @@ inline Status CheckInt32DivOverflow(int32_t a, int32_t b) { return SUCCESS; } -#define FMK_INT_ADDCHECK(a, b) \ - if (ge::CheckIntAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ - } - -#define FMK_INT8_ADDCHECK(a, b) \ - if (ge::CheckInt8AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int8 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ - } - -#define FMK_INT16_ADDCHECK(a, b) \ - if (ge::CheckInt16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int16 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_ADDCHECK(a, b) \ + if (ge::CheckIntAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_ADDCHECK(a, b) \ - if (ge::CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int32 %d and %d addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_ADDCHECK(a, b) \ + if (ge::CheckInt8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_ADDCHECK(a, b) \ - if (ge::CheckInt64AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int64 %ld and %ld addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_ADDCHECK(a, b) \ + if (ge::CheckInt16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_ADDCHECK(a, b) \ - if (ge::CheckUint8AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and 
%u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_ADDCHECK(a, b) \ + if (ge::CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_ADDCHECK(a, b) \ - if (ge::CheckUint16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_ADDCHECK(a, b) \ + if (ge::CheckInt64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_ADDCHECK(a, b) \ - if (ge::CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_ADDCHECK(a, b) \ + if (ge::CheckUint8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_ADDCHECK(a, b) \ - if (ge::CheckUint64AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_ADDCHECK(a, b) \ + if (ge::CheckUint16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("UINT16 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_ADDCHECK(a, b) \ - if (ge::CheckFp16AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_ADDCHECK(a, b) \ + if (ge::CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_ADDCHECK(a, b) \ - if (ge::CheckFloatAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_ADDCHECK(a, b) \ + if (ge::CheckUint64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_ADDCHECK(a, b) \ - if (ge::CheckDoubleAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf addition can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_ADDCHECK(a, b) \ + if (ge::CheckFp16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT_SUBCHECK(a, b) \ - if (ge::CheckIntSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FLOAT_ADDCHECK(a, b) \ + if (ge::CheckFloatAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Float %f and %f addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define 
FMK_INT8_SUBCHECK(a, b) \ - if (ge::CheckInt8SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT8 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_DOUBLE_ADDCHECK(a, b) \ + if (ge::CheckDoubleAddOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Double %lf and %lf addition can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT16_SUBCHECK(a, b) \ - if (ge::CheckInt16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT16 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_SUBCHECK(a, b) \ + if (ge::CheckIntSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_SUBCHECK(a, b) \ - if (ge::CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_SUBCHECK(a, b) \ + if (ge::CheckInt8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_SUBCHECK(a, b) \ - if (ge::CheckInt64SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_SUBCHECK(a, b) \ + if (ge::CheckInt16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_SUBCHECK(a, b) \ - if (ge::CheckUint8SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_SUBCHECK(a, b) \ + if (ge::CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_SUBCHECK(a, b) \ - if (ge::CheckUint16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_SUBCHECK(a, b) \ + if (ge::CheckInt64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_SUBCHECK(a, b) \ - if (ge::CheckUint32SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_SUBCHECK(a, b) \ + if (ge::CheckUint8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_SUBCHECK(a, b) \ - if (ge::CheckUint64SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_SUBCHECK(a, b) \ + if (ge::CheckUint16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint16 
%u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_SUBCHECK(a, b) \ - if (ge::CheckFp16SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_SUBCHECK(a, b) \ + if (ge::CheckUint32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_SUBCHECK(a, b) \ - if (ge::CheckFloatSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_SUBCHECK(a, b) \ + if (ge::CheckUint64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_SUBCHECK(a, b) \ - if (ge::CheckDoubleSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf subtraction can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_SUBCHECK(a, b) \ + if (ge::CheckFp16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT_MULCHECK(a, b) \ - if (ge::CheckIntMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FLOAT_SUBCHECK(a, b) \ + if (ge::CheckFloatSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Float %f and %f subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT8_MULCHECK(a, b) \ - if (ge::CheckInt8MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT8 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_DOUBLE_SUBCHECK(a, b) \ + if (ge::CheckDoubleSubOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Double %lf and %lf subtraction can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT16_MULCHECK(a, b) \ - if (ge::CheckInt16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT16 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ +#define FMK_INT_MULCHECK(a, b) \ + if (ge::CheckIntMulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int %d and %d multiplication can result in overflow!", static_cast(a), static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT32_MULCHECK(a, b) \ - if (ge::CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT8_MULCHECK(a, b) \ + if (ge::CheckInt8MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int8 %d and %d multiplication can result in overflow!", static_cast(a), static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_MULCHECK(a, b) \ - if (ge::Int64MulCheckOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld multiplication can result in overflow!", static_cast(a), \ - 
static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT16_MULCHECK(a, b) \ + if (ge::CheckInt16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int16 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT8_MULCHECK(a, b) \ - if (ge::CheckUint8MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT8 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT32_MULCHECK(a, b) \ + if (ge::CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int32 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT16_MULCHECK(a, b) \ - if (ge::CheckUint16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT16 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_MULCHECK(a, b) \ + if (ge::Int64MulCheckOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Int64 %ld and %ld multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT32_MULCHECK(a, b) \ - if (ge::CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT32 %u and %u multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_MULCHECK(a, b) \ + if (ge::CheckUint8MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint8 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_UINT64_MULCHECK(a, b) \ - if (ge::CheckUint64MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_MULCHECK(a, b) \ + if (ge::CheckUint16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint16 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FP16_MULCHECK(a, b) \ - if (ge::CheckFp16MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "fp16 %f and %f multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT32_MULCHECK(a, b) \ + if (ge::CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint32 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_FLOAT_MULCHECK(a, b) \ - if (ge::CheckFloatMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "float %f and %f multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_MULCHECK(a, b) \ + if (ge::CheckUint64MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Uint64 %lu and %lu multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_DOUBLE_MULCHECK(a, b) \ - if (ge::CheckDoubleMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "double %lf and %lf multiplication can result in overflow!", static_cast(a), \ - static_cast(b)); \ - return INTERNAL_ERROR; \ +#define FMK_FP16_MULCHECK(a, b) \ + if (ge::CheckFp16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGW("Fp16 %f and %f multiplication can result in overflow!", static_cast(a), 
static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT_DIVCHECK(a, b) \
-  if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT %d and %d division can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_FLOAT_MULCHECK(a, b) \
+  if (ge::CheckFloatMulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Float %f and %f multiplication can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT32_DIVCHECK(a, b) \
-  if (CheckInt32DivOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT32 %d and %d division can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_DOUBLE_MULCHECK(a, b) \
+  if (ge::CheckDoubleMulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Double %lf and %lf multiplication can result in overflow!", static_cast(a), \
+           static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_INT64_UINT32_MULCHECK(a, b) \
-  if (ge::CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \
-    GELOGE(INTERNAL_ERROR, "INT64 %ld and UINT32 %u multiplication can result in overflow!", static_cast(a), \
-           static_cast(b)); \
-    return INTERNAL_ERROR; \
+#define FMK_INT_DIVCHECK(a, b) \
+  if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int %d and %d division can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_FP16_ZEROCHECK(a) \
-  if (fabs(a) < DBL_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "fp16 %f can not be zero !", a); \
-    return INTERNAL_ERROR; \
+#define FMK_INT32_DIVCHECK(a, b) \
+  if (CheckInt32DivOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int32 %d and %d division can result in overflow!", static_cast(a), static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_FLOAT_ZEROCHECK(a) \
-  if (fabs(a) < FLT_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "float %f can not be zero !", a); \
-    return INTERNAL_ERROR; \
+#define FMK_INT64_UINT32_MULCHECK(a, b) \
+  if (ge::CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \
+    GELOGW("Int64 %ld and UINT32 %u multiplication can result in overflow!", static_cast(a), \
+           static_cast(b)); \
+    return INTERNAL_ERROR; \
   }

-#define FMK_DOUBLE_ZEROCHECK(a) \
-  if (fabs(a) < DBL_EPSILON) { \
-    GELOGE(INTERNAL_ERROR, "double %lf can not be zero !", a); \
+#define FMK_FP16_ZEROCHECK(a) \
+  if (fabs(a) < DBL_EPSILON || a < 0) { \
+    GELOGW("Fp16 %f can not be less than or equal to zero!", a); \
     return INTERNAL_ERROR; \
   }
+
+#define FMK_FLOAT_ZEROCHECK(a) \
+  if (fabs(a) < FLT_EPSILON || a < 0) { \
+    GELOGW("Float %f can not be less than or equal to zero!", a); \
+    return INTERNAL_ERROR; \
+  }
+
+#define FMK_DOUBLE_ZEROCHECK(a) \
+  if (fabs(a) < DBL_EPSILON || a < 0) { \
+    GELOGW("Double %lf can not be less than or equal to zero!
", a); \ + return INTERNAL_ERROR; \ + } } // namespace ge #endif // GE_COMMON_MATH_MATH_UTIL_H_ diff --git a/src/ge/common/tbe_kernel_store.cc b/src/ge/common/tbe_kernel_store.cc index 10ed51a6..9acead2d 100644 --- a/src/ge/common/tbe_kernel_store.cc +++ b/src/ge/common/tbe_kernel_store.cc @@ -16,126 +16,19 @@ #include "common/tbe_kernel_store.h" -#include -#include - -#include "common/ge/ge_util.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/debug/log.h" - namespace ge { -const uint32_t kKernelItemMagic = 0x5d776efd; - -struct KernelStoreItemHead { - uint32_t magic; - uint32_t name_len; - uint32_t bin_len; -}; TBEKernelStore::TBEKernelStore() {} -void TBEKernelStore::AddTBEKernel(const TBEKernelPtr &kernel) { - if (kernel != nullptr) { - kernels_[kernel->GetName()] = kernel; - } -} - -bool TBEKernelStore::Build() { - buffer_.clear(); - size_t total_len = 0; - for (const auto &item : kernels_) { - auto kernel = item.second; - total_len += sizeof(KernelStoreItemHead); - total_len += kernel->GetName().length(); - total_len += kernel->GetBinDataSize(); - } - - try { - buffer_.resize(total_len); - } catch (std::bad_alloc &e) { - GELOGE(ge::MEMALLOC_FAILED, "All build memory failed, memory size %zu", total_len); - return false; - } - - uint8_t *next_buffer = buffer_.data(); - size_t remain_len = total_len; - errno_t mem_ret; - for (const auto &item : kernels_) { - auto kernel = item.second; - KernelStoreItemHead kernel_head{}; - kernel_head.magic = kKernelItemMagic; - kernel_head.name_len = static_cast(kernel->GetName().length()); - kernel_head.bin_len = static_cast(kernel->GetBinDataSize()); - - mem_ret = memcpy_s(next_buffer, remain_len, &kernel_head, sizeof(kernel_head)); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - next_buffer += sizeof(kernel_head); - - mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head), kernel->GetName().data(), kernel_head.name_len); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - next_buffer += kernel_head.name_len; - - mem_ret = memcpy_s(next_buffer, remain_len - sizeof(kernel_head) - kernel_head.name_len, kernel->GetBinData(), - kernel_head.bin_len); - GE_CHK_BOOL_EXEC_NOLOG(mem_ret == EOK, return false); - - next_buffer += kernel_head.bin_len; - remain_len = remain_len - sizeof(kernel_head) - kernel_head.name_len - kernel_head.bin_len; - } - kernels_.clear(); - return true; -} - -const uint8_t *TBEKernelStore::Data() const { return buffer_.data(); } - -size_t TBEKernelStore::DataSize() const { return buffer_.size(); } - -bool TBEKernelStore::Load(const uint8_t *data, const size_t &len) { - if (data == nullptr || len == 0) { - return false; - } - size_t buffer_len = len; - while (buffer_len > sizeof(KernelStoreItemHead)) { - const char *next_buffer = reinterpret_cast(data) + (len - buffer_len); - - const auto *kernel_head = reinterpret_cast(next_buffer); - if (buffer_len < kernel_head->name_len + kernel_head->bin_len + sizeof(KernelStoreItemHead)) { - GELOGW("Invalid kernel block remain buffer len %zu, name len %u, bin len %u", buffer_len, kernel_head->name_len, - kernel_head->bin_len); - break; - } - - next_buffer += sizeof(KernelStoreItemHead); - std::string name(next_buffer, kernel_head->name_len); - - next_buffer += kernel_head->name_len; - GELOGI("Load kernel from om:%s,%u,%u", name.c_str(), kernel_head->name_len, kernel_head->bin_len); - std::vector kernel_bin(next_buffer, next_buffer + kernel_head->bin_len); - TBEKernelPtr teb_kernel_ptr = ge::MakeShared(name, std::move(kernel_bin)); - if 
(teb_kernel_ptr != nullptr) {
-      kernels_.emplace(name, teb_kernel_ptr);
-    }
-    buffer_len -= sizeof(KernelStoreItemHead) + kernel_head->name_len + kernel_head->bin_len;
-  }
-
-  return true;
-}
-
-TBEKernelPtr TBEKernelStore::FindTBEKernel(const std::string &name) const {
-  auto it = kernels_.find(name);
-  if (it != kernels_.end()) {
-    return it->second;
-  }
-  return nullptr;
-}
+void TBEKernelStore::AddTBEKernel(const TBEKernelPtr &kernel) { AddKernel(kernel); }

 void TBEKernelStore::LoadTBEKernelBinToOpDesc(const std::shared_ptr &op_desc) const {
   if (op_desc != nullptr) {
-    auto tbe_kernel = FindTBEKernel(op_desc->GetName());
-    if (tbe_kernel != nullptr) {
-      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel),
-                      GELOGW("LoadTBEKernelBinToOpDesc: SetExtAttr for tbe_kernel failed");)
-      GELOGI("Load tbe kernel:%s, %zu", tbe_kernel->GetName().c_str(), tbe_kernel->GetBinDataSize());
+    auto kernel_bin = FindKernel(op_desc->GetName());
+    if (kernel_bin != nullptr) {
+      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, kernel_bin),
+                      GELOGW("LoadTBEKernelBinToOpDesc: SetExtAttr for kernel_bin failed");)
+      GELOGI("Load tbe kernel:%s, %zu", kernel_bin->GetName().c_str(), kernel_bin->GetBinDataSize());
     }
   }
 }
diff --git a/src/ge/common/tbe_kernel_store.h b/src/ge/common/tbe_kernel_store.h
index 51d69af2..ab1ab9b4 100644
--- a/src/ge/common/tbe_kernel_store.h
+++ b/src/ge/common/tbe_kernel_store.h
@@ -17,38 +17,17 @@
 #ifndef GE_COMMON_TBE_KERNEL_STORE_H_
 #define GE_COMMON_TBE_KERNEL_STORE_H_

-#include
-#include
-#include
-#include
-#include
-
-#include "framework/common/fmk_types.h"
-#include "graph/op_desc.h"
-#include "graph/op_kernel_bin.h"
+#include "common/kernel_store.h"

 namespace ge {
-using TBEKernel = ge::OpKernelBin;
-using TBEKernelPtr = std::shared_ptr;
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY TBEKernelStore {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY TBEKernelStore : public KernelStore {
  public:
   TBEKernelStore();
-  ~TBEKernelStore() = default;
+  ~TBEKernelStore() {}
   void AddTBEKernel(const TBEKernelPtr &kernel);
-  bool Build();
-
-  bool Load(const uint8_t *data, const size_t &len);
-  TBEKernelPtr FindTBEKernel(const std::string &name) const;
   void LoadTBEKernelBinToOpDesc(const std::shared_ptr &op_desc) const;
-
-  const uint8_t *Data() const;
-  size_t DataSize() const;
-
- private:
-  std::unordered_map kernels_;
-  std::vector buffer_;
 };
 } // namespace ge
diff --git a/src/ge/engine_manager/dnnengine_manager.cc b/src/ge/engine_manager/dnnengine_manager.cc
index fe3c1bc8..3389e1b9 100644
--- a/src/ge/engine_manager/dnnengine_manager.cc
+++ b/src/ge/engine_manager/dnnengine_manager.cc
@@ -26,7 +26,10 @@
 #include "common/ge/ge_util.h"
 #include "common/util/error_manager/error_manager.h"
 #include "framework/common/debug/ge_log.h"
+#include "analyzer/analyzer.h"
 #include "graph/ge_context.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/utils/node_utils.h"
 #include "init/gelib.h"

 namespace {
@@ -164,11 +167,22 @@ bool DNNEngineManager::IsEngineRegistered(const std::string &name) {
   return false;
 }

-void DNNEngineManager::InitPerformanceStaistic() { checksupport_cost_.clear(); }
+void DNNEngineManager::InitPerformanceStaistic() {
+  std::lock_guard lock(mutex_);
+  checksupport_cost_.clear();
+}
+
+const map &DNNEngineManager::GetCheckSupportCost() const {
+  std::lock_guard lock(mutex_);
+  return checksupport_cost_;
+}

-const map &DNNEngineManager::GetCheckSupportCost() const { return checksupport_cost_; }
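The hunk resumes below with GetDNNEngineName reworked to take a NodePtr, so every caller now hands over the node itself rather than its OpDesc. A minimal call-site sketch (the pattern mirrors the updated call sites in model_cache_helper.cc and graph_builder.cc elsewhere in this patch; error handling trimmed):

```cpp
#include <string>
#include "graph/node.h"
#include "init/gelib.h"

// Sketch only: how a caller migrates to the node-based overload added below.
std::string AssignEngine(const ge::NodePtr &node) {
  auto instance = ge::GELib::GetInstance();
  if ((instance == nullptr) || !instance->InitFlag()) {
    return "";  // GE not initialized; real call sites log and bail out
  }
  // Passing the node (not just its OpDesc) lets the manager walk to the owner
  // compute graph and report unsupported ops to the analyzer.
  return instance->DNNEngineManagerObj().GetDNNEngineName(node);
}
```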
+std::string DNNEngineManager::GetDNNEngineName(const ge::NodePtr &node_ptr) { + std::lock_guard lock(mutex_); -std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { + GE_IF_BOOL_EXEC(node_ptr == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: node_ptr is nullptr"); + return ""); + auto op_desc = node_ptr->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: op_desc is nullptr"); return ""); // Use the OpsKernelManager in GELib to get the opInfos for this opCode @@ -190,6 +204,7 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { std::string exclude_core_Type = (ge_core_type == kVectorCore) ? kAIcoreEngine : kVectorEngine; GELOGD("engine type will exclude: %s", exclude_core_Type.c_str()); + auto root_graph = ge::GraphUtils::FindRootGraph(node_ptr->GetOwnerComputeGraph()); std::map unsupported_reasons; for (const auto &it : op_infos) { if (it.engine == exclude_core_Type) { @@ -206,6 +221,9 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; op_desc->SetOpEngineName(it.engine); op_desc->SetOpKernelLibName(kernel_name); + // set attrs for taking information when load txt to graph object + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_ENGINE_NAME_FOR_LX, it.engine); + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX, kernel_name); GELOGD("DNNEngineManager:Set OpKernelLibName %s and engine name %s to op_desc %s", kernel_name.c_str(), it.engine.c_str(), op_desc->GetName().c_str()); return it.engine; @@ -219,6 +237,9 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { "The custom operator registered by the user does not support the logic function delivered by this " "network. 
Check support failed, kernel_name is %s, op type is %s, op name is %s",
                kernel_name.c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str());
+        std::string error_info =
+          "The custom operator registered by the user does not support the logic function "
+          "delivered by this network";
         return "";
       }
       unsupported_reasons.emplace(kernel_name, unsupported_reason);
@@ -235,12 +256,22 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) {
                 kernel_name.c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str());
       }
     }
+
+  // concat unsupported reasons for analyzer data selection
+  string reason;
   for (const auto &it : unsupported_reasons) {
+    reason += it.first + ":" + it.second + ";";
     ErrorManager::GetInstance().ATCReportErrMessage("E13002", {"optype", "opskernel", "reason"},
                                                     {op_desc->GetType(), it.first, it.second});
     GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "GetDNNEngineName:Op type %s of ops kernel %s is unsupported, reason:%s",
            op_desc->GetType().c_str(), it.first.c_str(), it.second.c_str());
   }
+
+  analyzer::DataInfo analyze_info{root_graph->GetSessionID(), root_graph->GetGraphID(), analyzer::CHECKSUPPORT,
+                                  node_ptr, reason};
+  // do not change original process
+  (void)Analyzer::GetInstance()->DoAnalyze(analyze_info);
+
   ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"},
                                                   {op_desc->GetName(), op_desc->GetType()});
   GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, "Can't find any supported ops kernel and engine of %s, type is %s",
diff --git a/src/ge/engine_manager/dnnengine_manager.h b/src/ge/engine_manager/dnnengine_manager.h
index 6d5b02f9..c3ae5b95 100644
--- a/src/ge/engine_manager/dnnengine_manager.h
+++ b/src/ge/engine_manager/dnnengine_manager.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include

 #include "nlohmann/json.hpp"

@@ -29,6 +30,7 @@
 #include "common/opskernel/ops_kernel_info_types.h"
 #include "engine/dnnengine.h"
 #include "graph/op_desc.h"
+#include "graph/node.h"

 using JsonHandle = void *;
 namespace ge {
@@ -61,7 +63,7 @@ class DNNEngineManager {
   std::shared_ptr GetEngine(const std::string &name) const;
   bool IsEngineRegistered(const std::string &name);
   // If can't find appropriate engine name, return "", report error
-  string GetDNNEngineName(const OpDescPtr &op_desc);
+  string GetDNNEngineName(const ge::NodePtr &node_ptr);
   const map &GetSchedulers() const;
   const map &GetCheckSupportCost() const;
   void InitPerformanceStaistic();
@@ -83,6 +85,7 @@ class DNNEngineManager {
   std::map schedulers_;
   std::map checksupport_cost_;
   bool init_flag_;
+  mutable std::mutex mutex_;
 };
 } // namespace ge
diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt
index 17508711..f3956e31 100755
--- a/src/ge/executor/CMakeLists.txt
+++ b/src/ge/executor/CMakeLists.txt
@@ -22,6 +22,7 @@ file(GLOB PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
         "../../proto/insert_op.proto"
         "../../proto/op_mapping_info.proto"
         "../../proto/ge_ir.proto"
+        "../proto/dump_task.proto"
     )

 file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
@@ -68,6 +69,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../graph/manager/graph_manager_utils.cc"
     "../graph/manager/graph_mem_allocator.cc"
     "../graph/manager/graph_var_manager.cc"
+    "../graph/manager/rdma_pool_allocator.cc"
     "../graph/manager/trans_var_data_utils.cc"
     "../graph/manager/util/debug.cc"
     "../hybrid/hybrid_davinci_model_stub.cc"
diff --git a/src/ge/executor/ge_executor.cc b/src/ge/executor/ge_executor.cc
index 0d334042..b4e9df35 100644
--- a/src/ge/executor/ge_executor.cc
+++ b/src/ge/executor/ge_executor.cc
@@ -344,47 +344,19 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
     return FAILED;
   }

-  Status ret = GraphExecutor::SetDynamicSize(model_id, dynamic_dims, static_cast(DYNAMIC_DIMS));
+  vector cur_dynamic_dims;
+  Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims);
   if (ret != SUCCESS) {
-    GELOGE(FAILED, "Set dynamic size failed");
+    GELOGE(FAILED, "Set cur gear dynamic dims failed");
     return FAILED;
   }
-  vector cur_dynamic_dims;
-  std::vector input_desc;
-  std::vector output_desc;
-  ret = GetModelDescInfo(model_id, input_desc, output_desc);
-  if (ret != ge::SUCCESS) {
-    GELOGE(FAILED, "GetModelDescInfo failed.");
-    return FAILED;
-  }
-  vector user_designate_shape_order;
-  vector all_data_dims;
-  ret = GetUserDesignateShapeOrder(model_id, user_designate_shape_order);
-  if (ret != ge::SUCCESS) {
-    GELOGE(FAILED, "GetUserDesignateShapeOrder failed.");
-    return FAILED;
-  }
-  for (auto &data_name : user_designate_shape_order) {
-    for (size_t j = 0; j < input_desc.size(); ++j) {
-      if (input_desc.at(j).GetName() == data_name) {
-        for (auto dim : input_desc.at(j).GetShape().GetDims()) {
-          all_data_dims.push_back(dim);
-        }
-        break;
-      }
-    }
-  }
-  if (dynamic_dims.size() != all_data_dims.size()) {
-    GELOGE(FAILED, "Dynamic input size [%lu] is not equal with all data dims size [%lu]!", dynamic_dims.size(),
-           all_data_dims.size());
+  ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast(DYNAMIC_DIMS));
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "Set dynamic size failed");
     return FAILED;
   }
-  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
-    if (all_data_dims[i] < 0) {
-      cur_dynamic_dims.push_back(dynamic_dims[i]);
-    }
-  }
+
   size_t dynamic_dim_num = cur_dynamic_dims.size();
   uint64_t dynamic_input_size = static_cast(dynamic_dim_num * sizeof(uint64_t));
   if (length < dynamic_input_size) {
@@ -403,58 +375,43 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
   return SUCCESS;
 }

-Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector &combined_dims,
+Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector &dynamic_dims,
                                      vector &cur_dynamic_dims) {
-  vector> combined_batch;
-  if (GraphExecutor::GetCombinedDynamicDims(model_id, combined_batch) != SUCCESS) {
-    GELOGE(FAILED, "Get combined dynamic dims info failed.");
-    return FAILED;
-  }
-  if (combined_batch.empty()) {
-    GELOGE(FAILED, "Combined dynamic dims is empty.");
+  cur_dynamic_dims.clear();
+  vector input_desc;
+  vector output_desc;
+  auto ret = GetModelDescInfo(model_id, input_desc, output_desc);
+  if (ret != ge::SUCCESS) {
+    GELOGE(FAILED, "GetModelDescInfo failed.");
     return FAILED;
   }
-
-  if (combined_dims.size() != combined_batch[0].size()) {
-    GELOGE(FAILED, "Input dynamic dims's dimension size[%zu] is different from model[%zu].", combined_dims.size(),
-           combined_batch[0].size());
+  vector user_designate_shape_order;
+  vector all_data_dims;
+  ret = GetUserDesignateShapeOrder(model_id, user_designate_shape_order);
+  if (ret != ge::SUCCESS) {
+    GELOGE(FAILED, "GetUserDesignateShapeOrder failed.");
     return FAILED;
   }
-  bool matched = false;
-  size_t idx = 0;
-  for (size_t i = 0; i < combined_batch.size(); i++) {
-    bool is_match = true;
-    for (size_t j = 0; j < combined_dims.size(); j++) {
-      if (combined_dims[j] != static_cast(combined_batch[i][j])) {
-        is_match = false;
+  for (auto &data_name : user_designate_shape_order) {
+    for (auto &desc : input_desc) {
+      if (desc.GetName() == data_name) {
+        for (auto dim : desc.GetShape().GetDims()) {
+          all_data_dims.push_back(dim);
+        }
         break;
       }
     }
-    if (is_match) {
-      idx = i;
-      matched = true;
-      break;
-    }
-  }
-
-  if (!matched) {
-    GELOGE(FAILED, "Input dynamic dims can not match model.");
-    return FAILED;
   }
-
-  // batch_info save the dynamic info of combined_dims
-  vector> batch_info;
-  int32_t dynamic_type = static_cast(FIXED);
-  if (GraphExecutor::GetDynamicBatchInfo(model_id, batch_info, dynamic_type) != SUCCESS) {
-    GELOGE(FAILED, "Get dynamic input info failed.");
+  if (dynamic_dims.size() != all_data_dims.size()) {
+    GELOGE(FAILED, "Dynamic input size [%lu] is not equal to all data dims size [%lu]!", dynamic_dims.size(),
+           all_data_dims.size());
     return FAILED;
   }
-
-  cur_dynamic_dims.clear();
-  for (size_t i = 0; i < batch_info[idx].size(); i++) {
-    cur_dynamic_dims.emplace_back(static_cast(batch_info[idx][i]));
+  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
+    if (all_data_dims[i] < 0) {
+      cur_dynamic_dims.push_back(dynamic_dims[i]);
+    }
   }
-
   return SUCCESS;
 }
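A worked example of the filtering rule above, as a standalone sketch (shape values hypothetical; element types assumed to match the executor's vectors): the dims of all data inputs, concatenated in user-designated order, are compared position by position against the user's full dim list, and only the positions the model marks dynamic (dim < 0) are kept as the current gear.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Dims of all data inputs concatenated in user-designated order; -1 marks a dynamic dim.
  std::vector<int64_t> all_data_dims = {-1, 3, -1, 224};
  // User-supplied values, one per entry of all_data_dims (sizes must match, as checked above).
  std::vector<uint64_t> dynamic_dims = {8, 3, 640, 224};

  std::vector<uint64_t> cur_dynamic_dims;
  for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
    if (all_data_dims[i] < 0) {
      cur_dynamic_dims.push_back(dynamic_dims[i]);
    }
  }
  // Only the dynamic positions survive; {8, 640} is what SetDynamicSize receives.
  assert((cur_dynamic_dims == std::vector<uint64_t>{8, 640}));
  return 0;
}
```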
@@ -924,13 +881,6 @@ Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModel
     GELOGE(ret, "Get dynamic input info failed.");
     return ret;
   }
-  if (dynamic_type == static_cast(DYNAMIC_DIMS)) {
-    ret = GraphExecutor::GetCombinedDynamicDims(model_id, batch_info);
-    if (ret != SUCCESS) {
-      GELOGE(FAILED, "Get dynamic input info failed.");
-      return FAILED;
-    }
-  }
   if (!batch_info.empty()) {
     SetDynamicInputDataFlag(run_input_data, batch_info, input_data);
   }
diff --git a/src/ge/executor/module.mk b/src/ge/executor/module.mk
index 878341b6..6b2de8f2 100644
--- a/src/ge/executor/module.mk
+++ b/src/ge/executor/module.mk
@@ -13,6 +13,7 @@ local_ge_executor_src_files := \
     ../omm/csa_interact.cc \
     ../graph/manager/graph_manager_utils.cc \
    ../graph/manager/graph_var_manager.cc \
+    ../graph/manager/rdma_pool_allocator.cc \
     ../graph/manager/graph_mem_allocator.cc \
     ../graph/manager/graph_caching_allocator.cc \
     ../graph/manager/trans_var_data_utils.cc \
@@ -63,6 +64,7 @@ local_ge_executor_src_files := \
 local_ge_executor_c_include := \
     proto/insert_op.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     proto/ge_ir.proto \
     proto/task.proto \
     proto/om.proto \
diff --git a/src/ge/ge_inference.mk b/src/ge/ge_inference.mk
index 0cc0d6fb..3b9e17ea 100644
--- a/src/ge/ge_inference.mk
+++ b/src/ge/ge_inference.mk
@@ -59,6 +59,7 @@ GRAPH_MANAGER_LOCAL_SRC_FILES := \
     generator/ge_generator.cc \
     generator/generator_api.cc \
     graph/manager/graph_var_manager.cc \
+    graph/manager/rdma_pool_allocator.cc \
     graph/manager/graph_mem_allocator.cc \
     graph/manager/graph_caching_allocator.cc \

@@ -66,6 +67,9 @@ BUILER_SRC_FILES := \
     ir_build/ge_ir_build.cc \
     ir_build/atc_ir_common.cc \

+ANALYZER_SRC_FILES:= \
+    analyzer/analyzer.cc \
+
 OMG_HOST_SRC_FILES := \
     model/ge_model.cc \
     model/ge_root_model.cc \
@@ -103,6 +107,7 @@ OMG_HOST_SRC_FILES := \
     graph/passes/mark_graph_unknown_status_pass.cc \
     graph/common/omg_util.cc \
     graph/common/bcast.cc \
+    graph/common/local_context.cc \
     graph/passes/dimension_compute_pass.cc \
     graph/passes/dimension_adjust_pass.cc \
     graph/passes/get_original_format_pass.cc \
@@ -260,6 +265,7 @@ COMMON_LOCAL_C_INCLUDES := \
     proto/ge_ir.proto \
     proto/fwk_adapter.proto \
     proto/op_mapping_info.proto \
+    proto/dump_task.proto \
     proto/tensorflow/attr_value.proto \
     proto/tensorflow/function.proto \
     proto/tensorflow/graph.proto \
@@ -284,6 +290,9 @@ COMMON_LOCAL_C_INCLUDES := \
     third_party/protobuf/include \
     third_party/opencv/include \

+ANALYZER_LOCAL_INCLUDES := \ + $(TOPDIR)framework/domi/analyzer \ + NEW_OMG_HOST_SRC_FILES := \ graph/preprocess/insert_op/util_insert_aipp_op.cc \ graph/preprocess/insert_op/ge_aipp_op.cc \ @@ -348,6 +357,7 @@ LOCAL_CFLAGS += -g -O0 endif LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) +LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) LOCAL_SRC_FILES := $(COMMON_LOCAL_SRC_FILES) LOCAL_SRC_FILES += $(GRAPH_MANAGER_LOCAL_SRC_FILES) @@ -355,6 +365,7 @@ LOCAL_SRC_FILES += $(OMG_HOST_SRC_FILES) LOCAL_SRC_FILES += $(OME_HOST_SRC_FILES) LOCAL_SRC_FILES += $(NEW_OME_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(BUILER_SRC_FILES) +LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) LOCAL_STATIC_LIBRARIES := libge_memory \ @@ -414,9 +425,11 @@ LOCAL_SRC_FILES += $(GRAPH_MANAGER_LOCAL_SRC_FILES) LOCAL_SRC_FILES += $(OMG_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(OME_DEVICE_SRC_FILES) LOCAL_SRC_FILES += $(BUILER_SRC_FILES) +LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) LOCAL_C_INCLUDES := $(DEVICE_LOCAL_C_INCLUDES) +LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) LOCAL_STATIC_LIBRARIES := libge_memory \ diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.cc b/src/ge/ge_local_engine/engine/host_cpu_engine.cc index fd1b20d3..eb7741c0 100644 --- a/src/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -19,10 +19,48 @@ #include "graph/common/omg_util.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_adapter.h" -#include "mmpa/mmpa_api.h" #include "register/op_kernel_registry.h" +#include "register/host_cpu_context.h" #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" +#include "graph/utils/type_utils.h" +#include "common/fp16_t.h" + +namespace { +#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \ + case (DTYPE): { \ + GeTensorPtr ge_tensor = nullptr; \ + if (need_create_flag) { \ + int64_t data_num = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \ + std::unique_ptr buf(new (std::nothrow) TYPE[data_num]()); \ + if (buf == nullptr) { \ + GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \ + static_cast(sizeof(TYPE) * data_num)); \ + return MEMALLOC_FAILED; \ + } \ + ge_tensor = MakeShared(out_desc); \ + GE_CHECK_NOTNULL(ge_tensor); \ + GELOGI("node:%s allocate output %zu, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \ + ge_tensor->SetData(reinterpret_cast(buf.get()), data_num * sizeof(TYPE)); \ + ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \ + ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \ + outputs.emplace_back(ge_tensor); \ + } else { \ + ge_tensor = outputs[i]; \ + GE_CHECK_NOTNULL(ge_tensor); \ + GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \ + reinterpret_cast(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \ + } \ + auto tensor = TensorAdapter::AsTensor(*ge_tensor); \ + auto tensor_name = op_desc->GetOutputNameByIndex(i); \ + GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \ + op_desc->GetName().c_str(), i); \ + GELOGD("Successfully inserted output tensor. 
node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
+           op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
+    named_outputs.emplace(tensor_name, tensor); \
+    break; \
+  }
+}  // namespace

 namespace ge {
 namespace {
@@ -105,17 +143,32 @@ Status HostCpuEngine::PrepareInputs(const ge::ConstOpDescPtr &op_desc, const vec

 Status HostCpuEngine::PrepareOutputs(const ge::ConstOpDescPtr &op_desc, vector &outputs,
                                      map &named_outputs) {
+  if (!outputs.empty() && (outputs.size() != op_desc->GetOutputsSize())) {
+    GELOGW("Size of outputs does not match, size of outputs = %zu, expected output_num = %zu.", outputs.size(),
+           op_desc->GetOutputsSize());
+    outputs.clear();
+  }
+  bool need_create_flag = (outputs.size() != op_desc->GetOutputsSize());
   for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) {
-    auto ge_tensor = MakeShared(op_desc->GetOutputDesc(i));
-    GE_CHECK_NOTNULL(ge_tensor);
-    outputs.emplace_back(ge_tensor);
-    auto tensor = TensorAdapter::AsTensor(*ge_tensor);
-    auto tensor_name = op_desc->GetOutputNameByIndex(i);
-    GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu",
-                               op_desc->GetName().c_str(), i);
-    GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s", op_desc->GetName().c_str(),
-           i, tensor_name.c_str());
-    named_outputs.emplace(tensor_name, tensor);
+    const auto &out_desc = op_desc->GetOutputDesc(i);
+    switch (out_desc.GetDataType()) {
+      CREATE_OUTPUT_CASE(DT_BOOL, bool)
+      CREATE_OUTPUT_CASE(DT_INT8, int8_t)
+      CREATE_OUTPUT_CASE(DT_INT16, int16_t)
+      CREATE_OUTPUT_CASE(DT_INT32, int32_t)
+      CREATE_OUTPUT_CASE(DT_INT64, int64_t)
+      CREATE_OUTPUT_CASE(DT_UINT8, uint8_t)
+      CREATE_OUTPUT_CASE(DT_UINT16, uint16_t)
+      CREATE_OUTPUT_CASE(DT_UINT32, uint32_t)
+      CREATE_OUTPUT_CASE(DT_UINT64, uint64_t)
+      CREATE_OUTPUT_CASE(DT_FLOAT16, fp16_t)
+      CREATE_OUTPUT_CASE(DT_FLOAT, float)
+      CREATE_OUTPUT_CASE(DT_DOUBLE, double)
+      default:
+        GELOGE(PARAM_INVALID, "Data type %s is not supported.",
+               TypeUtils::DataTypeToSerialString(out_desc.GetDataType()).c_str());
+        return PARAM_INVALID;
+    }
   }

   return SUCCESS;
@@ -146,6 +199,7 @@ Status HostCpuEngine::Run(NodePtr &node, const vector &inputs,
   std::map named_inputs;
   std::vector tmp_outputs;
+  tmp_outputs.swap(outputs);
   std::map named_outputs;
   auto op_desc = node->GetOpDesc();
   GE_CHK_STATUS_RET_NOLOG(PrepareInputs(op_desc, inputs, named_inputs));
@@ -233,6 +287,15 @@ Status HostCpuEngine::LoadLib(const std::string &lib_path) {
     return INTERNAL_ERROR;
   }

+  auto initialize = (Status(*)(const HostCpuContext &))dlsym(handle, "Initialize");
+  if (initialize != nullptr) {
+    GELOGI("Invoke function Initialize in lib: %s", lib_path.c_str());
+    if (initialize(HostCpuContext()) != SUCCESS) {
+      GELOGW("Failed to invoke function Initialize in lib: %s", lib_path.c_str());
+    }
+  }
+
+  GELOGI("Lib: %s has been opened", lib_path.c_str());
   lib_handles_.emplace_back(handle);
   return SUCCESS;
 }
@@ -247,4 +310,4 @@ Status HostCpuEngine::GetRealPath(std::string &path) {
   path = real_path;
   return SUCCESS;
 }
-}  // namespace ge
\ No newline at end of file
+}  // namespace ge
diff --git a/src/ge/ge_runner.mk b/src/ge/ge_runner.mk
index 66e2be5a..b4d27b1b 100644
--- a/src/ge/ge_runner.mk
+++ b/src/ge/ge_runner.mk
@@ -42,6 +42,7 @@ LIBGE_LOCAL_SRC_FILES := \
     graph/build/stream_graph_optimizer.cc \
     graph/build/task_generator.cc \
     graph/common/bcast.cc \
+    graph/common/local_context.cc \
     graph/common/omg_util.cc \
     graph/common/transop_util.cc \
     graph/execute/graph_execute.cc \
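(Aside on the host_cpu_engine.cc hunk above: the dlsym'd "Initialize" entry point is optional, and its failure is deliberately non-fatal. A library that wants the hook would export something like the sketch below; the exact namespace and typedefs are assumptions inferred from the includes in the patch, and the context argument is treated as opaque.)

```cpp
// Library-side sketch of the optional hook; dlsym(handle, "Initialize") requires C linkage.
#include "ge/ge_api_error_codes.h"
#include "register/host_cpu_context.h"

extern "C" ge::Status Initialize(const ge::HostCpuContext &context) {
  // One-time setup for the custom host-CPU op library. Returning a non-SUCCESS
  // status only logs a warning on the engine side; library loading still continues.
  return ge::SUCCESS;
}
```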
@@ -88,6 +89,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/manager/graph_mem_allocator.cc \ graph/manager/graph_caching_allocator.cc \ graph/manager/graph_var_manager.cc \ + graph/manager/rdma_pool_allocator.cc \ graph/manager/model_manager/event_manager.cc \ graph/manager/trans_var_data_utils.cc \ graph/manager/util/debug.cc \ @@ -289,6 +291,7 @@ LIBGE_LOCAL_SRC_FILES := \ hybrid/node_executor/task_context.cc \ hybrid/hybrid_davinci_model.cc \ executor/ge_executor.cc \ + analyzer/analyzer.cc \ LIBCLIENT_LOCAL_SRC_FILES := \ proto/ge_api.proto \ @@ -308,11 +311,13 @@ RUNNER_LOCAL_C_INCLUDES := \ $(TOPDIR)inc/runtime \ $(TOPDIR)libc_sec/include \ $(TOPDIR)ops/built-in/op_proto/inc \ + $(TOPDIR)framework/domi/analyzer \ proto/fwk_adapter.proto \ proto/ge_ir.proto \ proto/insert_op.proto \ proto/om.proto \ proto/op_mapping_info.proto \ + proto/dump_task.proto \ proto/task.proto \ proto/tensorflow/attr_value.proto \ proto/tensorflow/function.proto \ diff --git a/src/ge/ge_runtime/task/aicpu_task.cc b/src/ge/ge_runtime/task/aicpu_task.cc index 15324919..5b3d8e82 100644 --- a/src/ge/ge_runtime/task/aicpu_task.cc +++ b/src/ge/ge_runtime/task/aicpu_task.cc @@ -75,7 +75,8 @@ bool AicpuTask::Distribute() { return false; } - flag = rtMemcpy(ext_info_, ext_size, reinterpret_cast(ext_info.data()), ext_size, RT_MEMCPY_HOST_TO_DEVICE); + flag = rtMemcpy(ext_info_, ext_size, const_cast(reinterpret_cast(ext_info.data())), ext_size, + RT_MEMCPY_HOST_TO_DEVICE); if (flag != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api(rtMemCpy) failed, ret: 0x%X.", flag); return false; diff --git a/src/ge/generator/ge_generator.cc b/src/ge/generator/ge_generator.cc index 0d4fac3f..edd7a155 100644 --- a/src/ge/generator/ge_generator.cc +++ b/src/ge/generator/ge_generator.cc @@ -15,6 +15,9 @@ */ #include "generator/ge_generator.h" + +#include + #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" #include "common/helper/model_helper.h" @@ -212,6 +215,9 @@ static void GetOpsProtoPath(string &opsproto_path) { class GeGenerator::Impl { public: + Impl(OmgContext &omg_context) : omg_context_(omg_context), graph_manager_(omg_context) {} + ~Impl() = default; + Status BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_models); Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model); @@ -221,10 +227,14 @@ class GeGenerator::Impl { Status GenerateInfershapeGraph(const Graph &graph); + OmgContext &omg_context_; GraphManager graph_manager_; SaveParam save_param_; bool is_offline_ = true; bool is_singleop_unregistered_ = false; + std::string build_mode_; + std::string build_step_; + static std::mutex mutex_; private: static std::string Trim(const std::string &str); @@ -234,8 +244,10 @@ class GeGenerator::Impl { bool SetOppVersionInfo(AttrHolder &obj); }; -Status GeGenerator::Initialize(const map &options) { - impl_ = ge::MakeShared(); +Status GeGenerator::Initialize(const map &options) { return Initialize(options, domi::GetContext()); } + +Status GeGenerator::Initialize(const map &options, OmgContext &omg_context) { + impl_ = ge::MakeShared(omg_context); if (impl_ == nullptr) { GELOGE(MEMALLOC_FAILED, "Make shared failed"); return MEMALLOC_FAILED; @@ -273,6 +285,17 @@ Status GeGenerator::Initialize(const map &options) { if (iter != options.end()) { impl_->save_param_.pri_key_file = iter->second; } + + // get build mode + iter = options.find(BUILD_MODE); + if (iter != options.end()) { + impl_->build_mode_ = iter->second; + } + // get build step + iter = 
options.find(BUILD_STEP);
+  if (iter != options.end()) {
+    impl_->build_step_ = iter->second;
+  }
   return SUCCESS;
 }

@@ -312,6 +335,8 @@ Status GeGenerator::GenerateInfershapeGraph(const Graph &graph) {
   return SUCCESS;
 }

+std::mutex GeGenerator::Impl::mutex_;
+
 // Remove the space and tab before and after the string
 std::string GeGenerator::Impl::Trim(const std::string &str) {
   if (str.empty()) {
@@ -436,8 +461,7 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
   auto rt = rtCtxGetCurrent(&ctx);
   if (rt != RT_ERROR_NONE) {
     GELOGW("Current ctx is null.");
-  } else {
-    ge::RtContextUtil::GetInstance().SetNormalModeContext(ctx);
+    ctx = nullptr;
   }

   GeRootModelPtr ge_root_model = nullptr;
@@ -451,6 +475,17 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
     }
     return ret;
   }
+
+  /// BUILD_MODE_TUNING with BUILD_STEP_BEFORE_UB_MATCH does not need to save the model;
+  /// BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER does not need to save the model;
+  /// BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB does not need to save the model.
+  if ((impl_->build_mode_ == BUILD_MODE_TUNING) &&
+      (impl_->build_step_ == BUILD_STEP_BEFORE_UB_MATCH || impl_->build_step_ == BUILD_STEP_AFTER_BUILDER ||
+       impl_->build_step_ == BUILD_STEP_AFTER_BUILDER_SUB)) {
+    GELOGI("Build mode:%s with step:%s does not need to save the model.", impl_->build_mode_.c_str(), impl_->build_step_.c_str());
+    return SUCCESS;
+  }
+
   GE_CHECK_NOTNULL(ge_root_model);
   GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
   ModelHelper model_helper;
@@ -474,8 +509,8 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
     return ret;
   }

-  if (RtContextUtil::GetInstance().GetNormalModeContext() != nullptr) {
-    (void)rtCtxSetCurrent(RtContextUtil::GetInstance().GetNormalModeContext());
+  if (ctx != nullptr) {
+    (void)rtCtxSetCurrent(ctx);
   }

   GELOGI("GenerateOfflineModel success.");
@@ -495,7 +530,8 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in
     return PARAM_INVALID;
   }

-  domi::GetContext().is_dynamic_input = ContainsDynamicInpus(*op_desc);
+  OmgContext &omg_context = (impl_ == nullptr) ?
domi::GetContext() : impl_->omg_context_; + omg_context.is_dynamic_input = ContainsDynamicInpus(*op_desc); if (op_desc->HasAttr(ATTR_NAME_UNREGST_OPPATH)) { impl_->is_singleop_unregistered_ = true; @@ -633,35 +669,32 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr & Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_root_model) { - static GraphId id = 0; + static std::atomic atomic_graph_id(0); + auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph fail, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph fail, graph id: %u", graph_id); (void)graph_manager_.Finalize(); return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED; } GELOGI("Model inputs size is %zu", inputs.size()); graph_manager_.SetOptionsRunGraphFlag(false); - struct timeval tv; - if (gettimeofday(&tv, nullptr) != 0) { - GELOGE(INTERNAL_ERROR, "get the time of day failed."); - return INTERNAL_ERROR; - } - uint64_t session_id = static_cast(tv.tv_sec * 1000000 + tv.tv_usec); // 1000000us + + static std::atomic atomic_session_id(0); + auto session_id = atomic_session_id.fetch_add(1); if (is_singleop_unregistered_) { - ret = graph_manager_.BuildGraphForUnregisteredOp(id, inputs, ge_root_model, session_id); + ret = graph_manager_.BuildGraphForUnregisteredOp(graph_id, inputs, ge_root_model, session_id); } else { - ret = graph_manager_.BuildGraph(id, inputs, ge_root_model, session_id); + ret = graph_manager_.BuildGraph(graph_id, inputs, ge_root_model, session_id); } if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", graph_id); VarManagerPool::Instance().RemoveVarManager(session_id); return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; } - id += 1; VarManagerPool::Instance().RemoveVarManager(session_id); @@ -669,21 +702,21 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector } Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph) { - static GraphId id = 0; + static std::atomic atomic_graph_id(0); + auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, graph id: %u", id); + GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, graph id: %u", graph_id); (void)graph_manager_.Finalize(); return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED; } - ret = graph_manager_.GenerateInfershapeGraph(id); + ret = graph_manager_.GenerateInfershapeGraph(graph_id); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager generate graph failed"); return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; } - id += 1; return SUCCESS; } diff --git a/src/ge/graph/build/graph_builder.cc b/src/ge/graph/build/graph_builder.cc index ac83d4ec..27d0b13f 100644 --- a/src/ge/graph/build/graph_builder.cc +++ b/src/ge/graph/build/graph_builder.cc @@ -63,7 +63,7 @@ Status 
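// Note on the GeGenerator hunks above: the shared "static GraphId id" counter
// and the gettimeofday()-derived session id are replaced by std::atomic
// counters, so concurrent generator calls obtain unique ids without a lock
// (fetch_add returns the previous value). A minimal sketch of the pattern,
// with hypothetical names and element types rather than the real GE
// declarations:

#include <atomic>
#include <cstdint>

uint32_t NextGraphId() {
  static std::atomic<uint32_t> next_graph_id(0);
  return next_graph_id.fetch_add(1);  // atomically take the current value
}

uint64_t NextSessionId() {
  static std::atomic<uint64_t> next_session_id(0);
  return next_session_id.fetch_add(1);
}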
GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { std::string kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { // reset op kernel lib - (void)instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr->GetOpDesc()); + (void)instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr); kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node_ptr->GetName().c_str(), @@ -84,6 +84,7 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { GELOGE(ret, "Calculate op running param failed, node name is %s", node_ptr->GetName().c_str()); return ret; } + GE_CHK_STATUS_RET(AddOutputMemTypeForNode(node_ptr)); } else { GELOGE(GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node_ptr->GetName().c_str()); return INTERNAL_ERROR; @@ -497,4 +498,24 @@ Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, vectorGetOpDesc(), ATTR_INPUT_MEMORY_TYPE, mem_type)) { + GELOGD("[%s] has attr input_memory_type %ld", node->GetName().c_str(), mem_type); + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + const auto &src_node = peer_out_anchor->GetOwnerNode(); + const auto &src_op = src_node->GetOpDesc(); + GE_IF_BOOL_EXEC(src_op == nullptr, continue); + if (!AttrUtils::SetInt(src_op, ATTR_OUTPUT_MEMORY_TYPE, mem_type)) { + GELOGE(INTERNAL_ERROR, "Set out_memory_type attr failed."); + return INTERNAL_ERROR; + } + return SUCCESS; + } + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/build/graph_builder.h b/src/ge/graph/build/graph_builder.h index dd229bc6..a70a5464 100644 --- a/src/ge/graph/build/graph_builder.h +++ b/src/ge/graph/build/graph_builder.h @@ -67,6 +67,7 @@ class GraphBuilder { GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); Status BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); + Status AddOutputMemTypeForNode(const NodePtr &node); Status BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID); int build_mode_; diff --git a/src/ge/graph/build/memory/binary_block_mem_assigner.h b/src/ge/graph/build/memory/binary_block_mem_assigner.h index 678a8adf..de6cae0d 100644 --- a/src/ge/graph/build/memory/binary_block_mem_assigner.h +++ b/src/ge/graph/build/memory/binary_block_mem_assigner.h @@ -24,7 +24,9 @@ namespace ge { class BinaryBlockMemAssigner : public BlockMemAssigner { public: - explicit BinaryBlockMemAssigner(ge::ComputeGraphPtr compute_graph) : BlockMemAssigner(std::move(compute_graph)) {} + BinaryBlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors) + : BlockMemAssigner(std::move(compute_graph), anchor_to_symbol, symbol_to_anchors) {} BinaryBlockMemAssigner(const BinaryBlockMemAssigner &) = delete; diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 3d956230..53b5b71c 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -32,10 +32,12 @@ #include "graph/debug/ge_attr_define.h" +#include "graph/common/local_context.h" #include "graph/optimize/common/params.h" #include "omg/omg_inner_types.h" 
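// The assigner constructors above and below now receive the ref-mapping
// tables (anchor_to_symbol / symbol_to_anchors) instead of recomputing them:
// HybridMemAssigner::Assign() builds them once via GetRefMapping and shares
// them with both block assigners by const reference. A reduced sketch of that
// ownership pattern (Assigner and the aliases are illustrative stand-ins):

#include <list>
#include <map>
#include <string>

using AnchorToSymbol = std::map<std::string, std::string>;
using SymbolToAnchors = std::map<std::string, std::list<std::string>>;

class Assigner {
 public:
  Assigner(const AnchorToSymbol &anchor_to_symbol, const SymbolToAnchors &symbol_to_anchors)
      : anchor_to_symbol_(anchor_to_symbol), symbol_to_anchors_(symbol_to_anchors) {}

 private:
  const AnchorToSymbol &anchor_to_symbol_;    // owner must outlive the assigner
  const SymbolToAnchors &symbol_to_anchors_;
};

void AssignOnce() {
  AnchorToSymbol anchor_to_symbol;    // built exactly once by the owner
  SymbolToAnchors symbol_to_anchors;
  Assigner binary(anchor_to_symbol, symbol_to_anchors);
  Assigner max(anchor_to_symbol, symbol_to_anchors);  // both share the tables
}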
#include "runtime/mem.h" +using std::list; using std::map; using std::pair; using std::set; @@ -402,8 +404,13 @@ string MemoryBlock::String() { return ss.str(); } -BlockMemAssigner::BlockMemAssigner(ge::ComputeGraphPtr compute_graph) - : mem_offset_(0), compute_graph_(std::move(compute_graph)), life_time_(0) {} +BlockMemAssigner::BlockMemAssigner(ComputeGraphPtr compute_graph, const map &anchor_to_symbol, + const map> &symbol_to_anchors) + : mem_offset_(0), + compute_graph_(std::move(compute_graph)), + symbol_to_anchors_(symbol_to_anchors), + anchor_to_symbol_(anchor_to_symbol), + life_time_(0) {} BlockMemAssigner::~BlockMemAssigner() { for (MemoryBlock *memory_block : memory_blocks_) { @@ -412,11 +419,6 @@ BlockMemAssigner::~BlockMemAssigner() { } void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { - if (GraphUtils::GetRefMapping(compute_graph_, symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); - return; - } - vector temp; for (const NodePtr &n : compute_graph_->GetAllNodes()) { auto node_op_desc = n->GetOpDesc(); @@ -692,13 +694,16 @@ bool BlockMemAssigner::IsPostReuse(const MemoryBlock *mem_block) const { /// @ingroup GE /// @brief check if symbol of cur node_index_io has block /// @param [in] node_index_io +/// @param [out] symbol /// @return bool /// -bool BlockMemAssigner::IsSymbolExist(const NodeIndexIO &node_index_io) { +bool BlockMemAssigner::IsSymbolExist(const NodeIndexIO &node_index_io, string &symbol) { auto iter = anchor_to_symbol_.find(node_index_io.ToString()); if (iter == anchor_to_symbol_.end()) { return false; } + + symbol = iter->second; return symbol_blocks_.find(iter->second) != symbol_blocks_.end(); } @@ -883,8 +888,8 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetNoAlignSize(*node_op_desc, index, no_align_size) != SUCCESS, return nullptr, "Get no align size failed"); - if (IsSymbolExist(node_index_io)) { - const std::string &symbol = anchor_to_symbol_[node_index_io.ToString()]; + std::string symbol; + if (IsSymbolExist(node_index_io, symbol)) { block = symbol_blocks_[symbol]; block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size); block->ref_count_++; @@ -949,8 +954,8 @@ bool IsOutputBlock(const ge::InDataAnchorPtr &in_data_anchor) { GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, GELOGE(FAILED, "Peer out anchor is nullptr."); return false); auto src = peer_out_anchor->GetOwnerNode(); int32_t index = peer_out_anchor->GetIdx(); - auto iter = domi::GetContext().out_nodes_map.find(src->GetName()); - if (iter != domi::GetContext().out_nodes_map.end()) { + auto iter = GetLocalOmgContext().out_nodes_map.find(src->GetName()); + if (iter != GetLocalOmgContext().out_nodes_map.end()) { for (auto id : iter->second) { if (index == id) { return true; diff --git a/src/ge/graph/build/memory/block_mem_assigner.h b/src/ge/graph/build/memory/block_mem_assigner.h index eedc7bec..7e37fe8e 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.h +++ b/src/ge/graph/build/memory/block_mem_assigner.h @@ -159,7 +159,8 @@ class MemoryBlock { class BlockMemAssigner : public MemAssigner { public: - explicit BlockMemAssigner(ge::ComputeGraphPtr compute_graph); + BlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors); BlockMemAssigner(const BlockMemAssigner &) = delete; @@ -241,9 +242,10 @@ class BlockMemAssigner : public 
MemAssigner { /// @ingroup GE /// @brief check if symbol of cur node_index_io has block /// @param [in] node_index_io + /// @param [out] symbol /// @return bool /// - bool IsSymbolExist(const NodeIndexIO &node_index_io); + bool IsSymbolExist(const NodeIndexIO &node_index_io, std::string &symbol); /// /// @ingroup GE @@ -261,8 +263,8 @@ class BlockMemAssigner : public MemAssigner { std::vector zero_memory_list_; // ref mapping - std::map> symbol_to_anchors_; - std::map anchor_to_symbol_; + const std::map> &symbol_to_anchors_; + const std::map &anchor_to_symbol_; std::map pre_reuse_flag_; std::map post_reuse_flag_; std::map symbol_size_; diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc index affa82c8..c9a6b8a2 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.cc +++ b/src/ge/graph/build/memory/graph_mem_assigner.cc @@ -18,6 +18,7 @@ #include #include #include "common/math/math_util.h" +#include "common/util/error_manager/error_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/build/memory/hybrid_mem_assigner.h" #include "graph/build/memory/var_mem_assign_util.h" @@ -226,6 +227,7 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset, VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); + ErrorManager::GetInstance().ATCReportErrMessage("E19022"); return ge::FAILED; } return SUCCESS; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.cc b/src/ge/graph/build/memory/hybrid_mem_assigner.cc index 925d742a..a75487de 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.cc +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.cc @@ -41,10 +41,17 @@ Status HybridMemAssigner::AssignMemory(std::unique_ptr &block_ } Status HybridMemAssigner::Assign() { - std::unique_ptr binary_assigner(new (std::nothrow) BinaryBlockMemAssigner(compute_graph_)); + if (GraphUtils::GetRefMapping(compute_graph_, symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); + return FAILED; + } + + std::unique_ptr binary_assigner( + new (std::nothrow) BinaryBlockMemAssigner(compute_graph_, anchor_to_symbol_, symbol_to_anchors_)); GE_CHECK_NOTNULL(binary_assigner); - std::unique_ptr max_assigner(new (std::nothrow) MaxBlockMemAssigner(compute_graph_)); + std::unique_ptr max_assigner( + new (std::nothrow) MaxBlockMemAssigner(compute_graph_, anchor_to_symbol_, symbol_to_anchors_)); GE_CHECK_NOTNULL(max_assigner); size_t bin_mem_size = 0; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.h b/src/ge/graph/build/memory/hybrid_mem_assigner.h index db3741d4..fba70a59 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.h +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.h @@ -54,6 +54,9 @@ class HybridMemAssigner : public MemAssigner { ge::ComputeGraphPtr compute_graph_; BlockMemAssignerPtr priority_assigner_; + + std::map anchor_to_symbol_; + std::map> symbol_to_anchors_; }; } // namespace ge #endif // GE_GRAPH_BUILD_MEMORY_HYBRID_MEM_ASSIGNER_H_ diff --git a/src/ge/graph/build/memory/max_block_mem_assigner.h b/src/ge/graph/build/memory/max_block_mem_assigner.h index cb46880a..f5626ebf 100644 --- a/src/ge/graph/build/memory/max_block_mem_assigner.h +++ b/src/ge/graph/build/memory/max_block_mem_assigner.h @@ -23,7 
+23,9 @@ namespace ge { class MaxBlockMemAssigner : public BlockMemAssigner { public: - explicit MaxBlockMemAssigner(ge::ComputeGraphPtr compute_graph) : BlockMemAssigner(std::move(compute_graph)) {} + MaxBlockMemAssigner(ComputeGraphPtr compute_graph, const std::map &anchor_to_symbol, + const std::map> &symbol_to_anchors) + : BlockMemAssigner(std::move(compute_graph), anchor_to_symbol, symbol_to_anchors) {} MaxBlockMemAssigner(const MaxBlockMemAssigner &) = delete; diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc index 9a314d80..9a37478d 100644 --- a/src/ge/graph/build/model_builder.cc +++ b/src/ge/graph/build/model_builder.cc @@ -28,6 +28,7 @@ #include "graph/build/stream_allocator.h" #include "graph/common/omg_util.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_attr_value.h" #include "graph/ge_context.h" @@ -244,7 +245,7 @@ Status ModelBuilder::SetInputOutputDesc() { } // if user set input node format ND, the expected node for data and netoutput format is ND in // final graph. - if ((domi::GetContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && + if ((GetLocalOmgContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && ((node_op_desc->GetType() == DATA_TYPE) || (node_op_desc->GetType() == NETOUTPUT))) { GELOGI("The node [%s] format should be set ND.", node_op_desc->GetName().c_str()); auto inputDescsPtr = node_op_desc->GetAllInputsDescPtr(); @@ -406,7 +407,7 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_ZERO_COPY_MEMORY_SIZE, zero_copy_mem_size_), GELOGE(FAILED, "SetInt of ATTR_MODEL_ZERO_COPY_MEMORY_SIZE failed."); return FAILED); - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, domi::GetContext().net_out_nodes), + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, GetLocalOmgContext().net_out_nodes), GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); return FAILED); GELOGI("For model, max_mem_offset_: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, zero_copy_mem_size_); @@ -571,26 +572,59 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { // Add weight ge_model.SetWeight(weight_buffer_); - // Add TBE Kernels - std::set name_set; + // Add TBE Kernels and custom aicpu op bin + std::set tbe_name_set; + std::set aicpu_name_set; for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); + if (tbe_kernel == nullptr) { + std::string kernel_name; + GeAttrValue::BYTES kernel_buffer; + (void)AttrUtils::GetStr(node_op_desc, ATTR_NAME_TBE_KERNEL_NAME, kernel_name); + (void)AttrUtils::GetBytes(node_op_desc, ATTR_NAME_TBE_KERNEL_BUFFER, kernel_buffer); + if (!kernel_name.empty() && (kernel_buffer.GetSize() > 0)) { + GE_CHECK_NOTNULL(kernel_buffer.GetData()); + std::vector data(kernel_buffer.GetData(), kernel_buffer.GetData() + kernel_buffer.GetSize()); + tbe_kernel = std::make_shared(kernel_name, std::move(data)); + } + } GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue); - if (name_set.count(tbe_kernel->GetName()) > 0) { + if (tbe_name_set.count(tbe_kernel->GetName()) > 0) { GELOGE(FAILED, "tbe_kernel 
name %s can't be the same", tbe_kernel->GetName().c_str()); return FAILED; } - name_set.insert(tbe_kernel->GetName()); + tbe_name_set.insert(tbe_kernel->GetName()); tbe_kernel_store_.AddTBEKernel(tbe_kernel); - GELOGD("Add tbe kernel bin %s", tbe_kernel->GetName().c_str()); + GELOGI("Add tbe kernel bin %s", tbe_kernel->GetName().c_str()); + } + + for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { + auto node_op_desc = n->GetOpDesc(); + GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); + CustAICPUKernelPtr cust_aicpu_kernel = + node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); + GE_IF_BOOL_EXEC(cust_aicpu_kernel == nullptr, continue); + if (aicpu_name_set.count(cust_aicpu_kernel->GetName()) > 0) { + GELOGE(FAILED, "aicpu_kernel name %s can't be the same", cust_aicpu_kernel->GetName().c_str()); + return FAILED; + } + aicpu_name_set.insert(cust_aicpu_kernel->GetName()); + cust_aicpu_kernel_store_.AddCustAICPUKernel(cust_aicpu_kernel); + GELOGI("Add cust aicpu kernel bin %s", cust_aicpu_kernel->GetName().c_str()); } + if (!tbe_kernel_store_.Build()) { GELOGE(FAILED, "TBE Kernels store build failed!"); return FAILED; } + if (!cust_aicpu_kernel_store_.Build()) { + GELOGE(FAILED, "custom AICPU kernels store build failed!"); + return FAILED; + } ge_model.SetTBEKernelStore(tbe_kernel_store_); + ge_model.SetCustAICPUKernelStore(cust_aicpu_kernel_store_); // Add task GeAttrValue::BYTES task_def_bytes; @@ -744,7 +778,7 @@ Status ModelBuilder::CompileSingleOp() { string kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { // Reset op kernel lib - (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(ge::INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node->GetName().c_str(), diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index 86b34c6d..e54d6695 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -25,6 +25,7 @@ #include #include "common/op/ge_op_utils.h" #include "common/tbe_kernel_store.h" +#include "common/cust_aicpu_kernel_store.h" #include "common/types.h" #include "common/util.h" #include "graph/compute_graph.h" @@ -108,6 +109,7 @@ class ModelBuilder { size_t zero_copy_mem_size_; TBEKernelStore tbe_kernel_store_; + CustAICPUKernelStore cust_aicpu_kernel_store_; uint8_t platform_type_; bool is_loop_graph_; diff --git a/src/ge/graph/build/stream_allocator.cc b/src/ge/graph/build/stream_allocator.cc index b7643e47..bcfea1d8 100644 --- a/src/ge/graph/build/stream_allocator.cc +++ b/src/ge/graph/build/stream_allocator.cc @@ -15,8 +15,8 @@ */ #include "graph/build/stream_allocator.h" -#include #include +#include #include "common/ge/ge_util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/fmk_error_codes.h" @@ -1062,12 +1062,12 @@ Status StreamAllocator::SetActiveStreamsForLoop() { GELOGI("there are %zu next iterator target streams has streamswitch node.", streams_skip_iterator_event.size()); for (auto iter : stream_id_to_last_node) { if (streams_skip_iterator_event.find(iter.first) != streams_skip_iterator_event.end()) { - GELOGI("skip stream %ld which has streamswitch node when add event to next iterator active node", + GELOGI("Skip stream %ld which has streamswitch node when adding event to next iterator active node", iter.first); 
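// The SaveDataToModel hunks above register TBE and custom AICPU kernel
// binaries through name-keyed std::sets, so a duplicate kernel name is
// rejected before it reaches the store. That dedup pattern, reduced to a
// sketch (Kernel stands in for the real TBEKernel/CustAICPUKernel types):

#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>

struct Kernel {
  std::string name;
};

bool AddKernelsUnique(const std::vector<std::shared_ptr<Kernel>> &kernels) {
  std::set<std::string> seen;
  for (const auto &kernel : kernels) {
    if (kernel == nullptr) {
      continue;  // node without an attached kernel: nothing to add
    }
    if (seen.count(kernel->name) > 0) {
      std::cerr << "kernel name " << kernel->name << " can't be the same\n";
      return false;  // mirrors the FAILED path above
    }
    seen.insert(kernel->name);
  }
  return true;
}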
continue; } if (iter.second->GetOwnerComputeGraph()->GetParentGraph() != nullptr) { - GELOGI("skip stream %ld which last node in subgraph when add event to next iterator active node", + GELOGI("Skip stream %ld which is last node in subgraph when adding event to next iterator active node", iter.first); continue; } @@ -1264,15 +1264,6 @@ void StreamAllocator::DumpEvents() { } Status StreamAllocator::GetMaxStreamAndTask(bool huge_stream, uint32_t &max_stream_count, uint32_t &max_task_count) { - const char *buffer_optimize_on = std::getenv("BUFFER_OPTIMIZE_ON"); - if (buffer_optimize_on != nullptr) { - rtError_t ret = rtSetPlatformType(PLATFORM_MINI_V1); - if (ret != RT_ERROR_NONE) { - GELOGE(FAILED, "Get max stream and task count by rts failed."); - return FAILED; - } - } - uint32_t stream_type = RT_NORMAL_STREAM; if (huge_stream) { stream_type = RT_HUGE_STREAM; diff --git a/src/ge/graph/build/stream_graph_optimizer.cc b/src/ge/graph/build/stream_graph_optimizer.cc index a3e8044d..49ecc674 100644 --- a/src/ge/graph/build/stream_graph_optimizer.cc +++ b/src/ge/graph/build/stream_graph_optimizer.cc @@ -102,12 +102,9 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com continue; } - const char *buffer_optimize_on = std::getenv("BUFFER_OPTIMIZE_ON"); - if (buffer_optimize_on == nullptr) { - if (!IsSameStreamId(subgraph)) { - GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str()); - continue; - } + if (!IsSameStreamId(subgraph)) { + GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str()); + continue; } OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index 91f70f2a..cf6b7a0d 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -31,6 +31,8 @@ #include "graph/utils/type_utils.h" #include "graph/common/ge_call_wrapper.h" #include "init/gelib.h" +#include "graph/ge_local_context.h" +#include "ge/ge_api_types.h" using domi::LogTimeStampDef; using domi::ModelTaskDef; @@ -527,7 +529,7 @@ Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) { // Reset op kernel lib name if (op_desc->GetOpKernelLibName().empty()) { - (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node); } all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc); @@ -762,24 +764,26 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin return SUCCESS; } -Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, - vector &all_reduce_nodes) const { - GELOGI("Start FindProfilingTaskIndex."); - GE_CHECK_NOTNULL(graph); - const char *profiling_mode = std::getenv(kProfilingMode); - bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn(); - if (!is_profiling) { +Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, + vector &all_reduce_nodes, std::string &fp_point_str, + std::string &bp_point_str) const { + if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_FPPONIT_OPTIONS, fp_point_str) == SUCCESS && + ge::GetContext().GetOption(OPTION_EXEC_PROFILING_BPPONIT_OPTIONS, bp_point_str) == SUCCESS && + !fp_point_str.empty() && !bp_point_str.empty()) { return SUCCESS; } + Status ret = SUCCESS; const char *fp_point = std::getenv(kProfilingFpPoint); - Status ret; if 
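// GetFpBpIndex (introduced above and continued below) resolves the profiling
// FP/BP points in three layers: the graph options
// (OPTION_EXEC_PROFILING_FPPONIT_OPTIONS / ..._BPPONIT_OPTIONS) win, then the
// kProfilingFpPoint / kProfilingBpPoint environment variables, and only then
// the AutoFind*OpIndex fallback, whose failure now returns FAILED instead of
// silently succeeding. A sketch of that precedence chain, where the option
// store and auto-detection helpers are hypothetical stubs:

#include <cstdlib>
#include <string>

bool GetOption(const std::string &key, std::string &value) {
  (void)key;
  (void)value;
  return false;  // stub: pretend no graph option was configured
}

std::string AutoFind() { return "auto_detected_op"; }  // stub fallback

std::string ResolvePoint(const std::string &option_key, const char *env_key) {
  std::string value;
  if (GetOption(option_key, value) && !value.empty()) {
    return value;  // 1) explicit graph option wins
  }
  const char *env = std::getenv(env_key);
  if (env != nullptr) {
    return std::string(env);  // 2) environment variable
  }
  return AutoFind();  // 3) fall back to auto-detection
}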
(fp_point == nullptr) { ret = AutoFindFpOpIndex(graph, profiling_point); if (ret != SUCCESS) { GELOGW("First forward profiling op_index not set and FindFpOpIndex failed."); - return SUCCESS; + return FAILED; } + } else { + fp_point_str = string(fp_point); + GELOGI("Get fp_point_str from env %s", fp_point_str.c_str()); } const char *bp_point = std::getenv(kProfilingBpPoint); @@ -787,20 +791,47 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi ret = AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling op_index not set and FindBpOpIndex failed."); - return SUCCESS; + return FAILED; } + } else { + bp_point_str = string(bp_point); + GELOGI("Get bp_point_str from env %s", bp_point_str.c_str()); } - if (fp_point != nullptr) { - string fp_point_str = string(fp_point); + return SUCCESS; +} + +Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, + vector &all_reduce_nodes) const { + GELOGI("Start FindProfilingTaskIndex."); + GE_CHECK_NOTNULL(graph); + const char *profiling_mode = std::getenv(kProfilingMode); + bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn(); + if (!is_profiling) { + GELOGW("Profiling is not open."); + return SUCCESS; + } + + GELOGI("Start get FP/BP index."); + std::string fp_point_str; + std::string bp_point_str; + Status ret = GetFpBpIndex(graph, profiling_point, all_reduce_nodes, fp_point_str, bp_point_str); + if (ret != SUCCESS) { + GELOGW("Get FP_POINT BP_POINT failed."); + return SUCCESS; + } + + GELOGI("fp_point_str:%s, bp_point_str:%s.", fp_point_str.c_str(), bp_point_str.c_str()); + + if (!fp_point_str.empty()) { ret = FindFpOfEnv(graph, fp_point_str, profiling_point); if (ret != SUCCESS) { GELOGW("First backward profiling op name set but FindFpOfEnv failed."); return SUCCESS; } } - if (bp_point != nullptr) { - string bp_point_str = string(bp_point); + + if (!bp_point_str.empty()) { ret = FindBpOfEnv(graph, bp_point_str, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling op name set but FindBpOfEnv failed."); diff --git a/src/ge/graph/build/task_generator.h b/src/ge/graph/build/task_generator.h index b2ca4470..6bd3ab03 100644 --- a/src/ge/graph/build/task_generator.h +++ b/src/ge/graph/build/task_generator.h @@ -118,6 +118,9 @@ class TaskGenerator { Status FindBpOfEnv(const ComputeGraphPtr &graph, const std::string &bp_point_str, ProfilingPoint &profiling_point, vector &all_reduce_nodes) const; + Status GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, vector &all_reduce_nodes, + std::string &fp_point_str, std::string &bp_point_str) const; + Status FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, std::vector &all_reduce_nodes) const; Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, diff --git a/src/ge/graph/common/local_context.cc b/src/ge/graph/common/local_context.cc new file mode 100644 index 00000000..43d3bc7c --- /dev/null +++ b/src/ge/graph/common/local_context.cc @@ -0,0 +1,38 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/common/local_context.h" + +#include "common/ge_inner_error_codes.h" +#include "common/debug/ge_log.h" +#include "omg/omg_inner_types.h" + +namespace ge { +namespace { +thread_local OmgContext *omg_context = nullptr; +} + +void SetLocalOmgContext(OmgContext &context) { omg_context = &context; } + +OmgContext &GetLocalOmgContext() { + if (omg_context != nullptr) { + return *omg_context; + } else { + GELOGW("omg_context is nullptr."); + return domi::GetContext(); + } +} +} // namespace ge diff --git a/src/ge/graph/common/local_context.h b/src/ge/graph/common/local_context.h new file mode 100644 index 00000000..1cdd2ca1 --- /dev/null +++ b/src/ge/graph/common/local_context.h @@ -0,0 +1,26 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ +#define GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ + +#include "omg/omg_inner_types.h" + +namespace ge { +void SetLocalOmgContext(OmgContext &context); +OmgContext &GetLocalOmgContext(); +} // namespace ge +#endif // GE_GRAPH_COMMON_LOCAL_CONTEXT_H_ diff --git a/src/ge/graph/load/graph_loader.cc b/src/ge/graph/load/graph_loader.cc index d181f3a5..c173d67a 100644 --- a/src/ge/graph/load/graph_loader.cc +++ b/src/ge/graph/load/graph_loader.cc @@ -121,70 +121,50 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, ModelData &model_data) { Status ret; - try { - if (!CheckInputPathValid(path)) { - GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); - return GE_EXEC_MODEL_PATH_INVALID; - } - - GELOGI("Load model begin, model path is: %s", path.c_str()); - if (!key_path.empty() && !CheckInputPathValid(key_path)) { - GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); - return GE_EXEC_MODEL_KEY_PATH_INVALID; - } - - ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); - return ret; - } + if (!CheckInputPathValid(path)) { + GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); + return GE_EXEC_MODEL_PATH_INVALID; + } - return SUCCESS; - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model from file failed, bad memory allocation"); - ret = MEMALLOC_FAILED; - } catch (...) 
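// local_context.cc above keeps a thread_local OmgContext pointer so each
// graph-building thread can carry its own context, and GetLocalOmgContext()
// falls back to the global domi::GetContext() when none was registered. The
// mechanism, as a self-contained sketch (Context is a hypothetical stand-in
// for OmgContext):

struct Context {
  int format = 0;
};

Context &GlobalContext() {
  static Context ctx;  // process-wide fallback, like domi::GetContext()
  return ctx;
}

thread_local Context *tls_ctx = nullptr;

void SetLocalContext(Context &ctx) { tls_ctx = &ctx; }

Context &GetLocalContext() {
  // Per-thread context if one was registered, otherwise the global default.
  return (tls_ctx != nullptr) ? *tls_ctx : GlobalContext();
}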
{ - GELOGE(FAILED, "Load model from file failed with exception"); - ret = FAILED; + GELOGI("Load model begin, model path is: %s", path.c_str()); + if (!key_path.empty() && !CheckInputPathValid(key_path)) { + GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); + return GE_EXEC_MODEL_KEY_PATH_INVALID; } - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; + ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; + } + return ret; } - return ret; + return SUCCESS; } Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority, const std::shared_ptr &listener, uint32_t &model_id) { Status ret; ModelData model_data; - - try { - ret = LoadDataFromFile(path, key_path, priority, model_data); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } - return ret; + ret = LoadDataFromFile(path, key_path, priority, model_data); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; } + return ret; + } - ret = LoadModel(model_data, listener, model_id); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } + ret = LoadModel(model_data, listener, model_id); + if (ret != SUCCESS) { + GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); + if (model_data.model_data != nullptr) { + delete[] static_cast(model_data.model_data); + model_data.model_data = nullptr; } - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model from file failed, bad memory allocation"); - ret = MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model from file failed with exception"); - ret = FAILED; } if (model_data.model_data != nullptr) { @@ -197,36 +177,27 @@ Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string Status GraphLoader::LoadModel(const ModelData &model_data, const std::shared_ptr &listener, uint32_t &model_id) { - try { - GELOGI("Load model begin, model_id:%u.", model_id); + GELOGI("Load model begin, model_id:%u.", model_id); - // For GeOp, Open Device 0 here. - GE_CHK_RT_RET(rtSetDevice(0)); - auto model_manager = ModelManager::GetInstance(); - GE_CHECK_NOTNULL(model_manager); - Status ret = model_manager->LoadModelOffline(model_id, model_data, listener); - if (ret != SUCCESS) { - GE_CHK_RT(rtDeviceReset(0)); - GELOGE(ret, "LoadModel: Load failed."); - return ret; - } - ret = model_manager->Start(model_id); - if (ret != SUCCESS) { - if (model_manager->Unload(model_id) != SUCCESS) { - GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start."); - } - GELOGE(ret, "LoadModel: Start failed."); - return ret; + // For GeOp, Open Device 0 here. 
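// With the blanket try/catch blocks removed above, every early return in the
// load path must release the partially filled ModelData buffer itself before
// propagating the Status. The cleanup shape, as a sketch (ModelData and
// LoadFromFile are simplified stand-ins, not the real GE signatures):

#include <cstdint>

enum Status { SUCCESS = 0, FAILED = 1 };

struct ModelData {
  uint8_t *model_data = nullptr;
};

// Stub loader: simulates a parser that allocates and then fails.
Status LoadFromFile(ModelData &md) {
  md.model_data = new uint8_t[16];
  return FAILED;
}

Status LoadChecked(ModelData &md) {
  Status ret = LoadFromFile(md);
  if (ret != SUCCESS) {
    delete[] md.model_data;   // release the partial allocation on the error path
    md.model_data = nullptr;  // avoid a dangling pointer or later double free
    return ret;
  }
  return SUCCESS;
}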
+ GE_CHK_RT_RET(rtSetDevice(0)); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + Status ret = model_manager->LoadModelOffline(model_id, model_data, listener); + if (ret != SUCCESS) { + GE_CHK_RT(rtDeviceReset(0)); + GELOGE(ret, "LoadModel: Load failed."); + return ret; + } + ret = model_manager->Start(model_id); + if (ret != SUCCESS) { + if (model_manager->Unload(model_id) != SUCCESS) { + GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start."); } - GELOGI("LoadModel: Start model success, model_id:%u.", model_id); - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model failed, bad memory allocation occur !"); - return MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model failed, some exceptions occur !"); - return FAILED; + GELOGE(ret, "LoadModel: Start failed."); + return ret; } - + GELOGI("LoadModel: Start model success, model_id:%u.", model_id); return SUCCESS; } @@ -255,28 +226,16 @@ Status GraphLoader::CommandHandle(const Command &command) { Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr, size_t memsize, void *weight_ptr, size_t weightsize) { - try { - GELOGI("Load model begin, model_id:%u.", model_id); - - // For ACL, Open Device from App. - auto model_manager = ModelManager::GetInstance(); - GE_CHECK_NOTNULL(model_manager); - Status ret = - model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize); - if (ret != SUCCESS) { - GELOGE(ret, "Load model failed, model_id:%u.", model_id); - return ret; - } - - GELOGI("Load model success, model_id:%u.", model_id); - } catch (std::bad_alloc &) { - GELOGE(MEMALLOC_FAILED, "Load model failed, bad memory allocation occur !"); - return MEMALLOC_FAILED; - } catch (...) { - GELOGE(FAILED, "Load model failed, some exceptions occur !"); - return FAILED; + GELOGI("Load model begin, model_id:%u.", model_id); + // For ACL, Open Device from App. 
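// LoadModel above pairs every acquisition with a rollback on its failure
// path: a failed offline load resets the device it opened, and a model that
// loaded but failed to start is unloaded before the error is returned. The
// control flow, as a hypothetical sketch in which all helpers are stubs:

#include <cstdint>
#include <cstdio>

enum Status { SUCCESS = 0, FAILED = 1 };

Status OpenDevice() { return SUCCESS; }
void ResetDevice() {}
Status Load(uint32_t &model_id) { model_id = 1; return SUCCESS; }
Status Start(uint32_t model_id) { (void)model_id; return FAILED; }
Status Unload(uint32_t model_id) { (void)model_id; return SUCCESS; }

Status LoadAndStart(uint32_t &model_id) {
  if (OpenDevice() != SUCCESS) {
    return FAILED;
  }
  if (Load(model_id) != SUCCESS) {
    ResetDevice();  // undo the device open before propagating the error
    return FAILED;
  }
  if (Start(model_id) != SUCCESS) {
    if (Unload(model_id) != SUCCESS) {  // roll back the successful load
      std::fprintf(stderr, "unload after failed start also failed\n");
    }
    return FAILED;
  }
  return SUCCESS;
}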
+ auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + Status ret = model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize); + if (ret != SUCCESS) { + GELOGE(ret, "Load model failed, model_id:%u.", model_id); + return ret; } - + GELOGI("Load model success, model_id:%u.", model_id); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/data_dumper.cc b/src/ge/graph/load/new_model_manager/data_dumper.cc index b94add80..e4e3a63f 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.cc +++ b/src/ge/graph/load/new_model_manager/data_dumper.cc @@ -16,21 +16,28 @@ #include "graph/load/new_model_manager/data_dumper.h" +#include +#include #include #include #include #include +#include "common/debug/memory_dumper.h" #include "common/properties_manager.h" +#include "common/util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/util.h" #include "graph/anchor.h" #include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/model_utils.h" +#include "graph/manager/util/debug.h" #include "graph/utils/attr_utils.h" #include "graph/utils/tensor_utils.h" +#include "proto/dump_task.pb.h" #include "proto/ge_ir.pb.h" #include "proto/op_mapping_info.pb.h" +#include "runtime/base.h" #include "runtime/mem.h" namespace { @@ -66,6 +73,16 @@ static bool ParseNameIndex(const std::string &node_name_index, std::string &node static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector v_memory_type, size_t i) { return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); } + +static uint64_t GetNowTime() { + uint64_t ret = 0; + struct timeval tv; + if (gettimeofday(&tv, NULL) == 0) { + ret = tv.tv_sec * 1000000ULL + tv.tv_usec; + } + + return ret; +} } // namespace static int32_t GetIrDataType(ge::DataType data_type) { @@ -176,6 +193,7 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id); OpDescInfo op_desc_info; op_desc_info.op_name = op->GetName(); + op_desc_info.op_type = op->GetType(); op_desc_info.task_id = task_id; op_desc_info.stream_id = stream_id; for (size_t i = 0; i < op->GetInputsSize(); ++i) { @@ -183,12 +201,28 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr op_desc_info.input_format.emplace_back(input_desc.GetFormat()); op_desc_info.input_shape.emplace_back(input_desc.GetShape().GetDims()); op_desc_info.input_data_type.emplace_back(input_desc.GetDataType()); + int64_t input_size = 0; + auto tensor_descs = op->GetAllInputsDesc(); + if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(i), input_size) != SUCCESS) { + GELOGW("Get input size failed"); + return; + } + GELOGI("Save dump op info, the input size is %ld", input_size); + op_desc_info.input_size.emplace_back(input_size); } for (size_t j = 0; j < op->GetOutputsSize(); ++j) { GeTensorDesc output_desc = op->GetOutputDesc(j); op_desc_info.output_format.emplace_back(output_desc.GetFormat()); op_desc_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); op_desc_info.output_data_type.emplace_back(output_desc.GetDataType()); + int64_t output_size = 0; + auto tensor_descs = op->GetAllOutputsDesc(); + if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(j), output_size) != SUCCESS) { + GELOGW("Get output size failed"); + return; + } + GELOGI("Save dump op info, the output size is %ld", output_size); + 
op_desc_info.output_size.emplace_back(output_size); } op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); @@ -810,4 +844,90 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { } } } + +Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) { + GELOGI("Start to dump exception input"); + for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) { + GELOGI("Start to dump exception output"); + for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != + SUCCESS) { + GELOGE(PARAM_INVALID, "Dump the %zu output data failed", i); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status DataDumper::DumpExceptionInfo(const std::vector exception_infos) { + GELOGI("Start to dump exception info"); + for (const rtExceptionInfo &iter : exception_infos) { + OpDescInfo op_desc_info; + if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { + toolkit::dumpdata::DumpData dump_data; + dump_data.set_version("2.0"); + dump_data.set_dump_time(GetNowTime()); + for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { + toolkit::dumpdata::OpInput input; + input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i]))); + input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); + for (auto dim : op_desc_info.input_shape[i]) { + input.mutable_shape()->add_dim(dim); + } + input.set_size(op_desc_info.input_size[i]); + GELOGI("The input size in exception is %ld", op_desc_info.input_size[i]); + dump_data.mutable_input()->Add(std::move(input)); + } + for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { + toolkit::dumpdata::OpOutput output; + output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j]))); + output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); + for (auto dim : op_desc_info.output_shape[j]) { + output.mutable_shape()->add_dim(dim); + } + output.set_size(op_desc_info.output_size[j]); + GELOGI("The output size in exception is %ld", op_desc_info.output_size[j]); + dump_data.mutable_output()->Add(std::move(output)); + } + uint64_t now_time = GetNowTime(); + string dump_file_path = "./" + op_desc_info.op_type + "." + op_desc_info.op_name + "." + + to_string(op_desc_info.task_id) + "." 
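// DumpExceptionInput/DumpExceptionOutput above walk the parallel addr/size
// vectors recorded in OpDescInfo and append each tensor's device memory to
// the per-exception dump file (which, as the code below shows, starts with a
// length-prefixed DumpData proto). The loop shape, as a sketch in which
// DumpDevMem is a stub standing in for the real Debug::DumpDevMem helper:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

bool DumpDevMem(const char *file, const void *addr, int64_t size) {
  (void)file; (void)addr; (void)size;
  return true;  // stub: the real helper appends device memory to the file
}

bool DumpAll(const std::vector<void *> &addrs, const std::vector<int64_t> &sizes,
             const std::string &file) {
  for (size_t i = 0; i < addrs.size(); ++i) {
    if (!DumpDevMem(file.c_str(), addrs.at(i), sizes.at(i))) {
      std::fprintf(stderr, "Dump the %zu tensor failed\n", i);
      return false;  // stop at the first tensor that cannot be dumped
    }
  }
  return true;
}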
+ to_string(now_time); + uint64_t proto_size = dump_data.ByteSizeLong(); + unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); + if (!ret || proto_size == 0) { + GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); + return PARAM_INVALID; + } + + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), + "Failed to dump proto size"); + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), + "Failed to dump proto msg"); + if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump exception input failed"); + return PARAM_INVALID; + } + + if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "Dump exception output failed"); + return PARAM_INVALID; + } + GELOGI("Dump exception info SUCCESS"); + } else { + GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid); + return PARAM_INVALID; + } + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/data_dumper.h b/src/ge/graph/load/new_model_manager/data_dumper.h index cb5bbd41..0a1c2274 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.h +++ b/src/ge/graph/load/new_model_manager/data_dumper.h @@ -31,6 +31,7 @@ #include "runtime/mem.h" #include "task_info/task_info.h" #include "framework/common/ge_types.h" +#include "runtime/base.h" namespace ge { class DataDumper { @@ -88,6 +89,11 @@ class DataDumper { const DumpProperties &GetDumpProperties() const { return dump_properties_; } bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + // Dump exception info + Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file); + Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file); + Status DumpExceptionInfo(const std::vector exception_infos); + private: void ReleaseDevMem(void **ptr) noexcept; diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 7daeb1b8..45cec2cf 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -43,6 +43,7 @@ #include "graph/graph.h" #include "graph/load/new_model_manager/cpu_queue_schedule.h" #include "graph/load/new_model_manager/tbe_handle_store.h" +#include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" #include "graph/manager/trans_var_data_utils.h" @@ -253,13 +254,7 @@ Status DavinciModel::Assign(const GeModelPtr &ge_model) { /// void DavinciModel::Shrink() { ge_model_.reset(); // delete object. - - // Old dump need op list, clear when closed. - char *ge_dump_env = std::getenv("DUMP_OP"); - int dump_op_switch = (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0; - if (dump_op_switch == 0) { - op_list_.clear(); - } + op_list_.clear(); } Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { @@ -295,8 +290,8 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. 
size: %zu", data_size); return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED; } - GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - mem_base_, data_size); + GEEVENT("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + mem_base_, data_size); weights_mem_base_ = mem_base_; @@ -337,8 +332,8 @@ Status DavinciModel::InitVariableMem() { return ret; } var_mem_base_ = VarManager::Instance(session_id_)->GetVarMemoryBase(RT_MEMORY_HBM); - GELOGI("[IMAS]InitVariableMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - var_mem_base_, TotalVarMemSize()); + GEEVENT("[IMAS]InitVariableMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + var_mem_base_, TotalVarMemSize()); } runtime_param_.var_base = var_mem_base_; return SUCCESS; @@ -774,6 +769,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { map data_by_index; auto nodes = compute_graph->GetAllNodes(); const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); + const CustAICPUKernelStore &aicpu_kernel_store = ge_model_->GetCustAICPUKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes.at(i); auto op_desc = node->GetOpDesc(); @@ -786,6 +782,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { GE_TIMESTAMP_RESTART(LoadTBEKernelBinToOpDesc); tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); + aicpu_kernel_store.LoadCustAICPUKernelBinToOpDesc(op_desc); GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); if (IsDataOp(op_desc->GetType())) { @@ -1076,30 +1073,42 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { /// /// @ingroup ge /// @brief output zero copy node Initialize. -/// @param [in] NodePtr: netoutput Op or merge op. +/// @param [in] NodePtr: netoutput Op. 
/// @return Status /// Status DavinciModel::InitOutputZeroCopyNodes(const NodePtr &node) { + set nodes_need_record; for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); if (peer_out_data_anchor == nullptr) { continue; } - auto node = peer_out_data_anchor->GetOwnerNode(); - auto op_desc = node->GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "Op desc is nullptr"); - return FAILED; - } + auto peer_node = peer_out_data_anchor->GetOwnerNode(); + nodes_need_record.emplace(peer_node); // Merge node output multiplexed input, upstream nodes need to be considered in multiple batch scenarios - if (node->GetType() == MERGE) { - if (InitOutputZeroCopyNodes(node) != SUCCESS) { - GELOGE(PARAM_INVALID, "Output merge zero copy nodes init failed!"); - return PARAM_INVALID; + if (peer_node->GetType() == MERGE) { + for (const auto &merge_peer_in_data_anchor : peer_node->GetAllInDataAnchors()) { + auto merge_peer_out_data_anchor = merge_peer_in_data_anchor->GetPeerOutAnchor(); + if (merge_peer_out_data_anchor == nullptr) { + continue; + } + auto merge_peer_node = merge_peer_out_data_anchor->GetOwnerNode(); + nodes_need_record.emplace(merge_peer_node); + } + } else { + for (const auto &other_in_data_anchor : peer_out_data_anchor->GetPeerInDataAnchors()) { + auto other_in_node = other_in_data_anchor->GetOwnerNode(); + if (other_in_node->GetType() != NETOUTPUT) { + nodes_need_record.emplace(other_in_node); + } } } + } + for (const auto &node_need_record : nodes_need_record) { + auto op_desc = node_need_record->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); string batch_label; (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label); if (batch_label.empty()) { @@ -2152,7 +2161,6 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) { Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind) { if (output_op_list_.empty()) { Status ret = SyncVarData(); - DumpOpInputOutput(); return ret; } @@ -2198,8 +2206,6 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r runtime_param_.graph_id, output.first, output.second.GetBasicAddr(), data_size, buffer_length); GE_CHK_RT_RET(rtMemcpy(buffer_addr, buffer_length, output.second.GetBasicAddr(), data_size, kind)); } - - DumpOpInputOutput(); return SUCCESS; } @@ -2264,6 +2270,14 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b // return result is not required if (!rslt_flg && !seq_end_flag) { GELOGW("Compute failed, model id: %u", model_id_); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (exception_infos.size() > 0) { + GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed"); + } else { + GELOGI("Exception info is null"); + } GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed."); return INTERNAL_ERROR; } @@ -2302,7 +2316,6 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS, outputs), "OnComputeDone failed"); return SUCCESS; } - /// /// @ingroup ge /// @brief return not output to upper layer for cloud case @@ -2318,114 +2331,12 @@ Status DavinciModel::ReturnNoOutput(uint32_t data_id) { op_desc->GetName().c_str()); } - DumpOpInputOutput(); GE_CHK_BOOL_EXEC(listener_ 
!= nullptr, return PARAM_INVALID, "listener_ is null!"); std::vector outputs; GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS, outputs), "OnComputeDone failed."); return SUCCESS; } -/// -/// @ingroup ge -/// @brief dump all op input and output information -/// @return void -/// -void DavinciModel::DumpOpInputOutput() { - char *ge_dump_env = std::getenv("DUMP_OP"); - int dump_op_switch = (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0; - if (dump_op_switch == 0) { - GELOGI("need to set DUMP_OP for dump op input and output"); - return; - } - - if (op_list_.empty()) { - GELOGW("op list is empty"); - return; - } - - int64_t cnt = 1; - for (auto it : op_list_) { - if (maxDumpOpNum_ != 0 && cnt > maxDumpOpNum_) { - GELOGW("dump op cnt > maxDumpOpNum, maxDumpOpNum: %ld", maxDumpOpNum_); - return; - } - - cnt++; - if (DumpSingleOpInputOutput(it.second) != SUCCESS) { - GELOGW("dump single op failed, model_id: %u", model_id_); - return; - } - } -} - -/// -/// @ingroup ge -/// @brief dump single op input and output information -/// @param [in] op_def: the op_desc which will be dump -/// @return Status result -/// -Status DavinciModel::DumpSingleOpInputOutput(const OpDescPtr &op_def) { - GE_CHK_BOOL_EXEC(nullptr != op_def, return PARAM_INVALID, "op_def is null!"); - string op_name = ge::StringUtils::ReplaceAll(op_def->GetName(), "/", "-"); - GELOGI("dump op name:%s, type:%s, model_id: %u.", op_def->GetName().c_str(), op_def->GetType().c_str(), model_id_); - string model_path = "./dump" + to_string(model_id_); - if (mmAccess(model_path.c_str()) != EN_OK) { - int32_t ret = mmMkdir(model_path.c_str(), S_IRUSR | S_IWUSR | S_IXUSR); - if (ret != EN_OK) { - GELOGE(FAILED, "make dir failed, model_id: %u", model_id_); - return FAILED; - } - } - const vector input_size_vec = ModelUtils::GetInputSize(op_def); - const vector input_addr_vec = ModelUtils::GetInputDataAddrs(runtime_param_, op_def); - vector v_memory_type; - bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); - GELOGD("DumpSingleOp[%s], input size[%zu], input memory type size[%zu]", op_def->GetName().c_str(), - op_def->GetInputsSize(), v_memory_type.size()); - for (size_t i = 0; i < input_addr_vec.size(); i++) { - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { - continue; - } - int64_t input_size = input_size_vec.at(i); - char input_file_name[PATH_MAX] = {0}; - if ((sprintf_s(input_file_name, PATH_MAX, "%s/dump_%u_%s_%s_input_%zu.bin", model_path.c_str(), model_id_, - op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { - GELOGE(FAILED, "construct input dump file path failed."); - return FAILED; - } - if ((Debug::DumpDevMem(input_file_name, input_addr_vec.at(i), input_size)) != SUCCESS) { - GELOGE(FAILED, "dump to input_file failed"); - return FAILED; - } - } - - const vector output_size_vec = ModelUtils::GetOutputSize(op_def); - const vector output_addr_vec = ModelUtils::GetOutputDataAddrs(runtime_param_, op_def); - v_memory_type.clear(); - has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); - GELOGD("DumpSingleOp[%s], output size[%zu], output memory type size[%zu]", op_def->GetName().c_str(), - op_def->GetOutputsSize(), v_memory_type.size()); - if (!(op_def->GetType() == "Const")) { - for (size_t i = 0; i < output_addr_vec.size(); i++) { - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { - continue; - } - int64_t output_size = output_size_vec.at(i); - char 
output_file_name[PATH_MAX] = {0}; - if ((sprintf_s(output_file_name, PATH_MAX, "%s/dump_%u_%s_%s_output_%zu.bin", model_path.c_str(), model_id_, - op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { - GELOGE(FAILED, "construct output dump file path failed."); - return FAILED; - } - if ((Debug::DumpDevMem(output_file_name, output_addr_vec.at(i), output_size)) != SUCCESS) { - GELOGE(FAILED, "dump to output_file failed"); - return FAILED; - } - } - } - return SUCCESS; -} - void *DavinciModel::Run(DavinciModel *model) { GE_CHK_BOOL_EXEC(model != nullptr, CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
@@ -3127,8 +3038,8 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & void *addr = data.second.GetDataInfo().at(count).second; void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data) + data.second.GetRelativeOffset().at(count)); - GELOGI("[ZCPY] Copy blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", data.first, addr, size, - buffer_addr); + GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", input_or_output.c_str(), + data.first, addr, size, buffer_addr); // For input data, just copy for rts task. for (ZeroCopyTask &task : zero_copy_tasks_) { uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr);
@@ -3486,7 +3397,6 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa is_async_mode_ = async_mode; GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_); GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); - is_dynamic_ = input_data.is_dynamic_batch; if (!is_dynamic_) { zero_copy_batch_label_addrs_.clear();
diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index e77c5510..ea94c22c 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h
@@ -345,21 +345,6 @@ class DavinciModel { Status ReturnNoOutput(uint32_t data_id); - /// - /// @ingroup ge - /// @brief dump all op input and output information - /// @return void - /// - void DumpOpInputOutput(); - - /// - /// @ingroup ge - /// @brief dump single op input and output information - /// @param [in] dump_op model_id - /// @return Status - /// - Status DumpSingleOpInputOutput(const OpDescPtr &dump_op); - Status ModelRunStart(); ///
diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc index 33e39847..9f0b114b 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc
@@ -18,9 +18,9 @@ #include +#include "common/dump/dump_manager.h" #include "common/l2_cache_optimize.h" #include "common/profiling/profiling_manager.h" -#include "common/dump/dump_manager.h" #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/util.h"
@@ -38,6 +38,7 @@ const int kDumpCmdPairSize = 2; } // namespace DumpProperties ModelManager::dump_properties_; +std::mutex ModelManager::exception_infos_mutex_; std::shared_ptr<ModelManager> ModelManager::GetInstance() { static const std::shared_ptr<ModelManager> instance_ptr =
@@ -154,6 +155,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { GELOGI("The session: %lu not created.", session_id); return; } else { + GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId()))); Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0); if (ret != SUCCESS) { GELOGW("The session: %lu destroy failed.", session_id);
@@ -161,6 +163,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { (void)sess_ids_.erase(session_id); GELOGI("The session: %lu destroyed.", session_id); } + GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); } }
@@ -369,7 +372,8 @@ Status ModelManager::Unload(uint32_t model_id) { } else { GELOGI("Unload model %u success.no need reset device,device_count: %u", model_id, device_count); } - + std::lock_guard<std::mutex> lock(exception_infos_mutex_); + exception_infos_.clear(); return SUCCESS; }
@@ -1106,4 +1110,23 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3 return FAILED; } +Status ModelManager::EnableExceptionDump(const std::map<std::string, std::string> &options) { + auto iter = options.find(OPTION_EXEC_ENABLE_EXCEPTION_DUMP); + if (iter != options.end()) { + GELOGI("Find option enable_exception_dump is %s", iter->second.c_str()); + if (iter->second == "1") { + rtError_t rt_ret = rtSetTaskFailCallback(ExceptionCallback); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtSetTaskFailCallback failed"); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + } else { + GELOGI("Option enable exception dump is %s", iter->second.c_str()); + } + } else { + GELOGI("Option enable exception dump is not found"); + } + return SUCCESS; +} + } // namespace ge
diff --git a/src/ge/graph/load/new_model_manager/model_manager.h b/src/ge/graph/load/new_model_manager/model_manager.h index a25b56a8..2c650c82 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.h +++ b/src/ge/graph/load/new_model_manager/model_manager.h
@@ -274,6 +274,22 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { bool IsDynamicShape(uint32_t model_id); ge::Status GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info); + ge::Status EnableExceptionDump(const std::map<std::string, std::string> &options); + + const std::vector<rtExceptionInfo> &GetExceptionInfos() { return exception_infos_; } + + void AddExceptionInfo(const rtExceptionInfo &exception_info) { exception_infos_.emplace_back(exception_info); } + + static void ExceptionCallback(rtExceptionInfo *exception_info) { + std::lock_guard<std::mutex> lock(exception_infos_mutex_); + auto instance = ModelManager::GetInstance(); + if (instance == nullptr) { + GELOGE(FAILED, "Instance is nullptr"); + return; + } + instance->AddExceptionInfo(*exception_info); + } + private: /// /// @ingroup domi_ome
@@ -309,8 +325,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::mutex map_mutex_; std::mutex sess_ids_mutex_; std::mutex session_id_create_mutex_; + static std::mutex exception_infos_mutex_; uint64_t session_id_bias_; std::set<uint64_t> sess_ids_; + std::vector<rtExceptionInfo> exception_infos_; static DumpProperties dump_properties_; };
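Note on the exception-dump flow above: when ge.exec.enable_exception_dump is "1", ModelManager registers ExceptionCallback with the runtime via rtSetTaskFailCallback, every rtExceptionInfo the runtime reports is appended to exception_infos_ under the static mutex, and Unload() clears the buffer. A minimal standalone sketch of this collect-under-callback pattern follows; TaskExceptionInfo and ExceptionRegistry are illustrative stand-ins invented here, only rtSetTaskFailCallback and rtExceptionInfo come from the diff itself.

#include <cstdint>
#include <mutex>
#include <vector>

struct TaskExceptionInfo {  // stand-in for rtExceptionInfo
  uint32_t task_id;
  uint32_t stream_id;
};

class ExceptionRegistry {
 public:
  static ExceptionRegistry &Instance() {
    static ExceptionRegistry inst;  // created once, lives for the process
    return inst;
  }
  // C-style callback handed to the runtime; must not throw.
  static void OnTaskFail(TaskExceptionInfo *info) {
    if (info == nullptr) {
      return;
    }
    ExceptionRegistry &reg = Instance();
    std::lock_guard<std::mutex> lock(reg.mutex_);
    reg.infos_.push_back(*info);
  }
  // Drain the buffered records, e.g. when a model is unloaded.
  std::vector<TaskExceptionInfo> Drain() {
    std::lock_guard<std::mutex> lock(mutex_);
    std::vector<TaskExceptionInfo> out;
    out.swap(infos_);
    return out;
  }

 private:
  std::mutex mutex_;
  std::vector<TaskExceptionInfo> infos_;
};

The diff keeps the buffered records inside ModelManager and exposes them through GetExceptionInfos(); the drain-on-read accessor above is just one alternative shape for the same idea. The hccl change below is unrelated: it migrates to the renamed HCCL reduce-op type.

diff --git a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index 98d1d5a4..11eaaca9 100644 --- a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
@@ -258,7 +258,7 @@ Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc, return SUCCESS; } - hcclRedOp_t op_type = HCCL_REP_OP_SUM; + HcclReduceOp op_type = HCCL_REDUCE_SUM; GE_CHECK_NOTNULL(davinci_model_); GELOGI("Calc opType[%s] input address before. 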
Node name[%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); if (!davinci_model_->IsKnownNode()) { diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 7c873c68..0cac91eb 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -37,11 +37,17 @@ const uint8_t kL2NotLoadToDdr = 0; // for skt constexpr int64_t kInvalidGroupKey = -1; constexpr uint32_t kSKTSingleSize = 1; -constexpr uint32_t kSKTMaxSizeLimit = 20000; const char *kIsLastNode = "is_last_node"; const char *kIsFirstNode = "is_first_node"; const int64_t kCloseSkt = 100; const uint32_t kAddrLen = sizeof(void *); +const char *const kLoadOpFromBuf = "loadOpFromBuf"; +struct CustAicpuSoBuf { + uint64_t kernelSoBuf; + uint32_t kernelSoBufLen; + uint64_t kernelSoName; + uint32_t kernelSoNameLen; +} __attribute__((packed)); } // namespace namespace ge { @@ -49,10 +55,7 @@ KernelTaskInfo::SuperKernelTaskInfo KernelTaskInfo::skt_info_ = { 0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr}; Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - if (davinci_model == nullptr) { - GELOGE(PARAM_INVALID, "davinci model is null!"); - return PARAM_INVALID; - } + GE_CHECK_NOTNULL(davinci_model); davinci_model_ = davinci_model; is_l1_fusion_enable_ = davinci_model_->GetL1FusionEnableOption(); GELOGD("KernelTaskInfo init start, ge.enableL1Fusion in davinci model is %d.", is_l1_fusion_enable_); @@ -71,16 +74,12 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci kernel_type_ = static_cast(context.kernel_type()); // get opdesc op_desc_ = davinci_model_->GetOpByIndex(context.op_index()); - if (op_desc_ == nullptr) { - GELOGE(INTERNAL_ERROR, "Get op desc failed, index is out of range!"); - return INTERNAL_ERROR; - } + GE_CHECK_NOTNULL(op_desc_); (void)AttrUtils::GetBool(*op_desc_, ATTR_N_BATCH_SPILT, is_n_batch_spilt_); GELOGD("node[%s] is_n_batch_spilt %d", op_desc_->GetName().c_str(), is_n_batch_spilt_); (void)AttrUtils::GetInt(*op_desc_, ATTR_NAME_FUSION_GROUP_KEY, group_key_); has_group_key_ = (group_key_ != kInvalidGroupKey); GELOGD("node[%s] has_group_key_ %ld, group key is [%ld]", op_desc_->GetName().c_str(), has_group_key_, group_key_); - // fusion_op_info vector original_op_names; bool result = AttrUtils::GetListStr(op_desc_, ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names); @@ -99,7 +98,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. 
stub_func: %s", kernel_def.stub_func().c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);); - } else if (kernel_type_ != cce::ccKernelType::AI_CPU) { + } else if (kernel_type_ == cce::ccKernelType::TE) { rtError_t rt_ret; rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -127,7 +126,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci ret = InitTVMTask(args_offset_tmp[0], kernel_def); } else if (kernel_type_ == cce::ccKernelType::CUSTOMIZED) { ret = InitAICPUCustomTask(context.op_index(), kernel_def); - } else if (kernel_type_ == cce::ccKernelType::AI_CPU) { + } else if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { ret = InitAicpuTask(context.op_index(), kernel_def); } else { if (kernel_def.args().empty() || args_size_ == 0) { @@ -332,10 +331,6 @@ bool KernelTaskInfo::DoubleCallSKTSaveCheck() { return (!is_n_batch_spilt_ && !h Status KernelTaskInfo::SuperKernelDistribute() { Status ret; - char *skt_task_num = getenv("SKT_TASK_NUM"); - auto task_num = static_cast((skt_task_num != nullptr) ? strtol(skt_task_num, nullptr, 10) - : kSKTMaxSizeLimit); // 10 for decimal number - GELOGI("SKT: SuperKernel Distribute Task num[skt_id:%lu]", task_num); if (FirstCallSKTLaunchCheck()) { ret = SuperKernelLaunch(); if (ret != SUCCESS) { @@ -381,7 +376,8 @@ Status KernelTaskInfo::Distribute() { char *skt_enable_env = getenv("SKT_ENABLE"); int64_t env_flag = (skt_enable_env != nullptr) ? strtol(skt_enable_env, nullptr, 10) : 0; bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); - if (kernel_type_ == cce::ccKernelType::AI_CPU) { + if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + GELOGI("distribute task info kernel_type %d, flag %d", kernel_type_, dump_flag_); // blockDim is reserved parameter, set to 1 rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name_.c_str()), reinterpret_cast(kernel_name_.c_str()), 1, args_, args_size_, @@ -865,10 +861,98 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { return SUCCESS; } +Status KernelTaskInfo::LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def) { + CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); + if (aicpu_kernel == nullptr) { + GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); + return INTERNAL_ERROR; + } + const void *aicpu_data = aicpu_kernel->GetBinData(); + uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize(); + + void *d_aicpu_data = nullptr; + rtError_t status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + void *d_so_name = nullptr; + status = rtMalloc(&d_so_name, so_name_.size(), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtMemcpy(d_so_name, so_name_.size(), reinterpret_cast(so_name_.c_str()), so_name_.size(), + RT_MEMCPY_HOST_TO_DEVICE); + if (status != 
RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + CustAicpuSoBuf cust_aicpu_so_buf; + cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast(reinterpret_cast(d_aicpu_data)); + cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; + cust_aicpu_so_buf.kernelSoName = reinterpret_cast(reinterpret_cast(d_so_name)); + cust_aicpu_so_buf.kernelSoNameLen = so_name_.size(); + + void *args = nullptr; + uint32_t args_size = sizeof(CustAicpuSoBuf); + status = rtMalloc(&args, args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + GELOGI("loadOpFromBuf kernelSoBuf %p, kernelSoBufLen %u, kernelSoName %p, kernelSoNameLen %u.", d_aicpu_data, + aicpu_data_length, d_so_name, so_name_.size()); + + status = rtMemcpy(args, args_size, static_cast(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + rtStream_t stream = nullptr; + status = rtStreamCreate(&stream, 0); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt create stream failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + status = rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt CpuKernelLaunch loadOpFromBuf failed, status: 0x%X", status); + return RT_ERROR_TO_GE_STATUS(status); + } + GELOGI("Cpu kernel launch loadOpFromBuf."); + + status = rtStreamSynchronize(stream); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + + GE_CHK_RT(rtFree(args)); + GE_CHK_RT(rtFree(d_aicpu_data)); + GE_CHK_RT(rtFree(d_so_name)); + + GELOGI("Cpu kernel launch loadOpFromBuf task success."); + return SUCCESS; +} + Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) { GELOGI("Do InitAicpuTask"); so_name_ = kernel_def.so_name(); kernel_name_ = kernel_def.kernel_name(); + GELOGI("node[%s] test so name %s, kernel name %s", op_desc_->GetName().c_str(), so_name_.c_str(), + kernel_name_.c_str()); OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index); if (op_desc == nullptr) { @@ -876,6 +960,10 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k return INTERNAL_ERROR; } + if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, kernel_def), "launch cust aicpu so failed"); + } + // copy args to new host memory std::unique_ptr args_addr(new (std::nothrow) uint8_t[args_size_]); GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_) @@ -940,6 +1028,9 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } dump_args_ = static_cast(args_) + sizeof(aicpu::AicpuParamHead); } + if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; + } davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, args_addr.get(), args_, args_size_, sizeof(aicpu::AicpuParamHead)); @@ -1195,16 +1286,6 @@ uint8_t KernelTaskInfo::IsL2CpToDDR(uint8_t origain_L2_load_to_ddr) { if (dump_flag_ == RT_KERNEL_DUMPFLAG) { return kL2LoadToDdr; } - - static char *ge_dump_env = std::getenv("DUMP_OP"); - if (ge_dump_env != nullptr) { - static 
std::string ge_dump_str(ge_dump_env); - static std::string open_ge_dump("1"); - if (ge_dump_str == open_ge_dump) { - return kL2LoadToDdr; - } - } - return kL2NotLoadToDdr; }
diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index 8ada2082..1c45682e 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
@@ -106,6 +106,8 @@ class KernelTaskInfo : public TaskInfo { Status InitAicpuTaskExtInfo(const std::string &ext_info); + Status LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def); + Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs, const std::vector<void *> &output_data_addrs, const std::vector<::tagCcAICPUTensor> &input_descs,
diff --git a/src/ge/graph/load/new_model_manager/zero_copy_task.cc b/src/ge/graph/load/new_model_manager/zero_copy_task.cc index 00920aad..30ce8a86 100644 --- a/src/ge/graph/load/new_model_manager/zero_copy_task.cc +++ b/src/ge/graph/load/new_model_manager/zero_copy_task.cc
@@ -130,8 +130,8 @@ Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const ma } auto dst_addr = static_cast<uint8_t *>(buffer_addr); - GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx", name_.c_str(), - args_addr_, args_size_, offset, addr); + GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx, user_data_addr: %p", + name_.c_str(), args_addr_, args_size_, offset, addr, buffer_addr); *(uintptr_t *)(args_info + offset) = reinterpret_cast<uintptr_t>(dst_addr); is_updated_ = true; }
diff --git a/src/ge/graph/manager/graph_caching_allocator.h b/src/ge/graph/manager/graph_caching_allocator.h index 94a5066a..850a73e8 100644 --- a/src/ge/graph/manager/graph_caching_allocator.h +++ b/src/ge/graph/manager/graph_caching_allocator.h
@@ -29,6 +29,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" +#include "graph/manager/block_memory.h" #include "runtime/mem.h" namespace ge {
@@ -38,30 +39,8 @@ constexpr size_t kKByteSize = 1024; constexpr size_t kMByteSize = 1024 * 1024; constexpr size_t kGByteSize = 1024 * 1024 * 1024; -struct Block; -typedef bool (*Comparison)(const Block *, const Block *); -using BlockBin = std::set<Block *, Comparison>; static const uint32_t kNumBins = 8; -struct Block { - uint32_t device_id; // npu device id - size_t size; // block size in bytes - BlockBin *bin; // owning block bin - uint8_t *ptr; // memory address - bool allocated; // in-use flag - Block *prev; // prev block if split from a larger allocation - Block *next; // next block if split from a larger allocation - - Block(uint32_t device, size_t size, BlockBin *bin, uint8_t *ptr) - : device_id(device), size(size), bin(bin), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {} - - // constructor for search key - Block(uint32_t device, size_t size, uint8_t *ptr) - : device_id(device), size(size), bin(nullptr), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {} - - bool IsSplit() const { return (prev != nullptr) || (next != nullptr); } -}; - class MemoryAllocator; class CachingAllocator {
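The Block/BlockBin structure removed here moves to graph/manager/block_memory.h so that CachingAllocator and the new RdmaPoolAllocator can share it: blocks split from one device allocation form a doubly linked chain (prev/next), and free blocks sit in size-ordered bins. A hedged sketch of the best-fit-then-split lookup both allocators build on these declarations; FindAndSplit is an illustrative helper, not a routine from the diff, and it assumes the bin comparator orders blocks by size (the real code adds locking, logging, and a ShouldSplit heuristic).

// Assumes the Block/BlockBin declarations shown above (now in block_memory.h).
Block *FindAndSplit(BlockBin &bin, uint32_t device_id, size_t aligned_size) {
  Block key(device_id, aligned_size, nullptr);  // search key: device + requested size
  auto it = bin.lower_bound(&key);              // first free block with size >= aligned_size
  if (it == bin.end()) {
    return nullptr;
  }
  Block *block = *it;
  bin.erase(it);
  if (block->size > aligned_size) {  // split off the remainder and keep it free
    auto *rest = new Block(device_id, block->size - aligned_size, nullptr, block->ptr + aligned_size);
    rest->prev = block;
    rest->next = block->next;
    if (block->next != nullptr) {
      block->next->prev = rest;
    }
    block->next = rest;
    block->size = aligned_size;
    bin.insert(rest);
  }
  block->allocated = true;
  return block;
}

diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index 582b206a..08f7ec9e 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc
@@ -33,7 +33,9 @@ #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include 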
"framework/common/ge_types.h" +#include "analyzer/analyzer.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -42,6 +44,7 @@ #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/util/rt_context_util.h" #include "graph/partition/dynamic_shape_partition.h" +#include "graph/passes/enter_pass.h" #include "graph/passes/addn_pass.h" #include "graph/passes/bitcast_pass.h" #include "graph/passes/atomic_addr_clean_pass.h" @@ -110,6 +113,9 @@ const char *const kSend = "Send"; const char *const kRecv = "Recv"; const char *const kCheckPointForGetVar = "CheckPointGraphForGetVar"; const char *const kCheckPointGraph = "checkpoint_graph"; +const char *const kVectorEngine = "VectorEngine"; +const char *const kAIcoreEngine = "AIcoreEngine"; +const char *const kOffOptimize = "off_optimize"; bool IsTailingOptimization() { string is_tailing_optimization_option; @@ -125,7 +131,10 @@ bool IsTailingOptimization() { } // namespace namespace ge { -GraphManager::GraphManager() : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false) {} +GraphManager::GraphManager(OmgContext &omg_context) + : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false), omg_context_(omg_context) { + SetLocalOmgContext(omg_context); +} Status GraphManager::Initialize(const std::map &options) { if (init_flag_) { @@ -321,14 +330,56 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::Com return SUCCESS; } -Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) { +Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map ©_graphs) { + GE_CHECK_NOTNULL(compute_graph); + vector old_compute_graphs; + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + old_compute_graphs.emplace_back(subgraph->GetSubGraph()); + } + for (const auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + old_compute_graphs.emplace_back(subgraph->GetSubGraph()); + } + } + + for (const auto &old_compute_graph : old_compute_graphs) { + std::vector input_nodes; + std::vector output_nodes; + ComputeGraphPtr new_compute_graph = GraphUtils::CloneGraph(old_compute_graph, "", input_nodes, output_nodes); + if (new_compute_graph == nullptr) { + GELOGE(INTERNAL_ERROR, "Clone graph failed."); + return INTERNAL_ERROR; + } + copy_graphs.emplace(old_compute_graph->GetName(), new_compute_graph); + if (!AttrUtils::SetBool(old_compute_graph, ATTR_NAME_NEED_LX_FUSION, true)) { + GELOGE(INTERNAL_ERROR, "Set attr lx_fusion to graph failed."); + return INTERNAL_ERROR; + } + } + + GELOGI("Copy %zu graphs successfully.", copy_graphs.size()); + return SUCCESS; +} + +Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, + Graph2SubGraphInfoList &sub_graph_map, uint64_t session_id) { + GE_CHECK_NOTNULL(compute_graph); // use default 16 multi thread const uint32_t thread_num = 16; ThreadPool executor(thread_num); - auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); std::vector> vector_future; const auto &root_subgraph_list = sub_graph_map[compute_graph]; + std::string op_compile_strategy; + (void)AttrUtils::GetStr(compute_graph, 
ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + GELOGI("OptimizeSubGraphWithMultiThreads process op_compile_strategy: %s", op_compile_strategy.c_str()); for (const auto &subgraph : root_subgraph_list) { + if (!op_compile_strategy.empty()) { + (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + } std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, GetThreadLocalContext()); if (!f.valid()) {
@@ -341,6 +392,9 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr for (auto &function_graph : compute_graph->GetAllSubgraphs()) { auto subgraph_list = sub_graph_map[function_graph]; for (const auto &subgraph : subgraph_list) { + if (!op_compile_strategy.empty()) { + (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); + } std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, GetThreadLocalContext()); if (!f.valid()) {
@@ -361,6 +415,130 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr return SUCCESS; } +bool GraphManager::CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map) { + if (compute_graph == nullptr) { + GELOGE(PARAM_INVALID, "Input param compute_graph is nullptr."); + return false; + } + + /// 1. FE sets attr optimize_group to true (false) when lx fusion succeeds (fails); + /// 2. FE does not set attr optimize_group when fe.ini disables l2fusion; + /// 3. Other engines do not set attr optimize_group. + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + bool optimize_group = true; + (void)AttrUtils::GetBool(subgraph->GetSubGraph(), ATTR_NAME_OPTIMIZE_GROUP, optimize_group); + if (!optimize_group) { + GELOGW("Run lx optimize for subgraph:%s failed.", subgraph->GetSubGraph()->GetName().c_str()); + return false; + } + } + for (auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + bool optimize_group = true; + (void)AttrUtils::GetBool(subgraph->GetSubGraph(), ATTR_NAME_OPTIMIZE_GROUP, optimize_group); + if (!optimize_group) { + GELOGW("Run lx optimize for subgraph:%s failed.", subgraph->GetSubGraph()->GetName().c_str()); + return false; + } + } + } + GELOGI("All subgraphs are optimized successfully, no need to fall back to lx buffer optimize."); + return true; +} + +Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_graph, + Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs) { + GE_CHECK_NOTNULL(compute_graph); + const auto &root_subgraph_list = sub_graph_map[compute_graph]; + for (const auto &subgraph : root_subgraph_list) { + auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); + if (iter == copy_graphs.end()) { + GELOGE(FAILED, "Cannot find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); + return FAILED; + } + subgraph->SetSubGraph(iter->second); + } + + for (auto &function_graph : compute_graph->GetAllSubgraphs()) { + const auto &subgraph_list = sub_graph_map[function_graph]; + for (const auto &subgraph : subgraph_list) { + auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); + if (iter == copy_graphs.end()) { + GELOGE(FAILED, "Cannot find subgraph:%s in copy graphs.", 
subgraph->GetSubGraph()->GetName().c_str()); + return FAILED; + } + subgraph->SetSubGraph(iter->second); + } + } + GELOGI("All subgraphs are successfully replaced."); + return SUCCESS; +} + +Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) { + GE_CHECK_NOTNULL(compute_graph); + auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + std::string buffer_optimize; + graphStatus graph_status = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize); + bool need_lx_fusion = (graph_status == GRAPH_SUCCESS) && (buffer_optimize != kOffOptimize); + if (options_.build_mode.empty() && need_lx_fusion) { + GELOGI("Enter normal mode with buffer_optimize:%s.", buffer_optimize.c_str()); + /// 1. Back up subgraphs so buffer optimize can rerun if lx fusion fails. + /// 2. Mark each graph with attr "lx_fusion" for fusion optimize. + std::unordered_map<std::string, ComputeGraphPtr> copy_graphs; + GE_TIMESTAMP_START(CopySubGraphAndMarkFusion); + Status ret = CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs); + GE_TIMESTAMP_EVENT_END(CopySubGraphAndMarkFusion, "SetSubgraph:CopySubGraphAndMarkFusion"); + if (ret != SUCCESS) { + GELOGE(ret, "CopySubGraphAndMarkFusion failed."); + return ret; + } + + // Multi-thread optimize subgraphs with lx fusion + ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx fusion failed."); + return ret; + } + + // Check whether all subgraphs succeeded in lx fusion + GE_TIMESTAMP_START(CheckAllFusionOptimizeSuccess); + if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) { + GE_TIMESTAMP_EVENT_END(CheckAllFusionOptimizeSuccess, "SetSubgraph:CheckAllFusionOptimizeSuccess"); + return SUCCESS; + } + + // Replace subgraphs with the original graphs for lx buffer + ret = ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs); + if (ret != SUCCESS) { + GELOGE(ret, "Replace subgraph with original graph failed."); + return ret; + } + + // Multi-thread optimize subgraphs with lx buffer + ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx buffer failed."); + return ret; + } + } else { + /// Multi-thread optimize subgraphs: + /// 1. run lx buffer while build_mode is normal and buffer_optimize is empty or "off_optimize"; + /// 2. run lx fusion or buffer according to build_mode and build_step in FE. + GELOGI("Directly optimize subgraph with build mode:%s, and step:%s, buffer_optimize:%s.", + options_.build_mode.c_str(), options_.build_step.c_str(), buffer_optimize.c_str()); + Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multi-thread optimize subgraph with lx buffer failed."); + return ret; + } + } + return SUCCESS; +} +
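The new SetSubgraph implements a try-then-fallback protocol around lx fusion. The condensed member below is a hypothetical restatement of its normal-mode branch over the same helpers (RunSubgraphOptimizeWithFallback is not a function from the diff; the attrs and steps are):

Status GraphManager::RunSubgraphOptimizeWithFallback(ComputeGraphPtr &compute_graph,
                                                     Graph2SubGraphInfoList &sub_graph_map,
                                                     uint64_t session_id) {
  std::unordered_map<std::string, ComputeGraphPtr> copy_graphs;
  // 1. Snapshot each subgraph and tag it with ATTR_NAME_NEED_LX_FUSION.
  GE_CHK_STATUS_RET(CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs), "Backup failed.");
  // 2. Optimize all subgraphs in parallel; FE reports per-subgraph success via ATTR_NAME_OPTIMIZE_GROUP.
  GE_CHK_STATUS_RET(OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id), "Fusion failed.");
  // 3. If every subgraph kept its fusion result, the fused graphs stand.
  if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) {
    return SUCCESS;
  }
  // 4. Otherwise restore the snapshots and rerun the optimization in lx buffer mode.
  GE_CHK_STATUS_RET(ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs), "Restore failed.");
  return OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
}

#define GM_RUN_AND_DUMP_PERF(name, func, ...) 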
\ do { \ GE_RUN_PERF(GraphManager, func, __VA_ARGS__); \
@@ -368,18 +546,10 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr GELOGI("Run %s on graph %s(%u) success.", name, compute_graph->GetName().c_str(), graph_node->GetGraphId()); \ } while (0) -Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, - GeRootModelPtr &ge_root_model, uint64_t session_id) { +Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + ge::ComputeGraphPtr &compute_graph, uint64_t session_id) { GE_CHECK_NOTNULL(graph_node); - GE_CHECK_NOTNULL(graph_node->GetGraph()); - auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); GE_CHECK_NOTNULL(compute_graph); - - GEEVENT("PreRun start, graph node size %zu, session id %lu, graph id %u, graph name %s", - compute_graph->GetDirectNodesSize(), session_id, compute_graph->GetGraphID(), - compute_graph->GetName().c_str()); - GE_DUMP(compute_graph, "PreRunBegin"); - GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", graph_optimize_.OptimizeOriginalGraphForQuantize, compute_graph); GM_RUN_AND_DUMP_PERF("HandleSummaryOp", graph_optimize_.HandleSummaryOp, compute_graph); GM_RUN_AND_DUMP_PERF("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph,
@@ -388,10 +558,6 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorInferShapeInNeed); - const char *unknown_shape_skip = std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION"); - if (unknown_shape_skip != nullptr) { - PassManager graph_pass; - GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) - GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); - } + + PassManager graph_pass; + GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) + GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); GE_CHK_STATUS_RET(graph_optimize_.IdentifyReference(compute_graph), "Identify reference failed."); + GELOGI("PreRun:PreRunOptimizeOriginalGraph success."); + return SUCCESS; +} + +Status GraphManager::PreRunOptimizeSubGraph(const GraphNodePtr &graph_node, ge::ComputeGraphPtr &compute_graph, + uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(compute_graph); GM_RUN_AND_DUMP_PERF("OptimizeSubgraph", OptimizeSubgraph, graph_node, compute_graph, session_id); + + // Dump graph to tuning path + if (options_.build_mode == BUILD_MODE_TUNING && options_.build_step == BUILD_STEP_AFTER_UB_MATCH) { + std::string tuning_path; + (void)GetContext().GetOption(TUNING_PATH, tuning_path); + GELOGI("Dump path:%s.", tuning_path.c_str()); + GraphUtils::DumpGEGraph(compute_graph, "", true, tuning_path); + } + GELOGI("PreRun:PreRunOptimizeSubGraph success."); + return SUCCESS; +} + +Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, + GeRootModelPtr &ge_root_model, uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(compute_graph); GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph); GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts", graph_optimize_.OptimizeGraphBeforeBuildForRts, compute_graph); GM_RUN_AND_DUMP_PERF("Build", Build, graph_node, compute_graph, ge_root_model, session_id); + GELOGI("PreRun:PreRunAfterOptimizeSubGraph success."); + return SUCCESS; +} + +Status GraphManager::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint64_t 
session_id, uint32_t graph_id) { + GELOGI("Set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id, graph_id, + static_cast<int>(mode), ge::GetContext().DeviceId()); + + rtError_t rt_ret = rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return FAILED; + } + rt_ret = rtCtxSetCurrent(rt_context); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return FAILED; + } + RtContextUtil::GetInstance().AddRtContext(session_id, graph_id, rt_context); + return SUCCESS; +} + +Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + GeRootModelPtr &ge_root_model, uint64_t session_id) { + GE_CHECK_NOTNULL(graph_node); + GE_CHECK_NOTNULL(graph_node->GetGraph()); + auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); + GE_CHECK_NOTNULL(compute_graph); + compute_graph->SetSessionID(session_id); + auto analyzer_instance = Analyzer::GetInstance(); + GE_CHECK_NOTNULL(analyzer_instance); + GE_CHK_STATUS_RET(analyzer_instance->BuildJsonObject(session_id, compute_graph->GetGraphID()), + "BuildJsonObject Failed") + + GEEVENT("PreRun start, graph node size %zu, session id %lu, graph id %u, graph name %s", + compute_graph->GetDirectNodesSize(), session_id, compute_graph->GetGraphID(), + compute_graph->GetName().c_str()); + GE_DUMP(compute_graph, "PreRunBegin"); + // Create a generate-mode rt context for this graph build. + Status ret = SetRtContext(rtContext_t(), RT_CTX_GEN_MODE, session_id, compute_graph->GetGraphID()); + if (ret != SUCCESS) { + GELOGE(ret, "Set rt context failed."); + return ret; + } + + /// 1. BUILD_MODE_TUNING with BUILD_STEP_AFTER_UB_MATCH no need PreRunOptimizeOriginalGraph; + /// 2. BUILD_MODE_TUNING with BUILD_STEP_AFTER_MERGE no need PreRunOptimizeOriginalGraph; + /// 3. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB no need PreRunOptimizeOriginalGraph. + bool run_optimize_original_graph = + !((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_AFTER_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_MERGE || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)); + if (run_optimize_original_graph) { + Status ret = PreRunOptimizeOriginalGraph(graph_node, inputs, compute_graph, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunOptimizeOriginalGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } + + // BUILD_MODE_TUNING with BUILD_STEP_AFTER_MERGE no need PreRunOptimizeSubGraph. + bool run_optimize_subgraph = + !((options_.build_mode == BUILD_MODE_TUNING) && (options_.build_step == BUILD_STEP_AFTER_MERGE)); + if (run_optimize_subgraph) { + Status ret = PreRunOptimizeSubGraph(graph_node, compute_graph, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunOptimizeSubGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } + + /// 1. BUILD_MODE_TUNING with BUILD_STEP_BEFORE_UB_MATCH no need PreRunAfterOptimizeSubGraph; + /// 2. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER no need PreRunAfterOptimizeSubGraph; + /// 3. BUILD_MODE_TUNING with BUILD_STEP_AFTER_BUILDER_SUB no need PreRunAfterOptimizeSubGraph. + bool run_after_optimize_subgraph = + !((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_BEFORE_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_BUILDER || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)); + if (run_after_optimize_subgraph) { + Status ret = PreRunAfterOptimizeSubGraph(graph_node, compute_graph, ge_root_model, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Run PreRunAfterOptimizeSubGraph failed for graph:%s.", compute_graph->GetName().c_str()); + return ret; + } + } // when set incre build, save om model and var manager GeModelPtr ge_model = nullptr;
@@ -456,7 +728,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: if (ret != SUCCESS) { ret = PreRun(graph_node, inputs, ge_root_model, session_id); // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(session_id); + RtContextUtil::GetInstance().DestroyRtContexts(session_id, graph_node->GetGraphId()); if (ret != SUCCESS) { GELOGE(ret, "PreRun Failed."); return ret;
@@ -1065,7 +1337,7 @@ Status GraphManager::ParseOptions(const std::map &opti // net output node dataType ParseOption(options, OUTPUT_DATATYPE, options_.output_datatype); if (!options_.output_datatype.empty()) { - domi::GetContext().output_type = options_.output_datatype; + omg_context_.output_type = options_.output_datatype; } // Set save_original_model flag (ge.save_original_model)
@@ -1074,6 +1346,10 @@ // Original model file name ParseOption(options, ORIGINAL_MODEL_FILE, options_.original_model_file); + // Set build mode and step + ParseOption(options, BUILD_MODE, options_.build_mode); + ParseOption(options, BUILD_STEP, options_.build_step); + return SUCCESS; }
@@ -1659,6 +1935,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { ReshapeRemovePass reshape_remove_pass; ConstantFoldingPass constant_folding_pass; DimensionAdjustPass dimension_adjust_pass; + EnterPass enter_pass; AddNPass addn_pass; SwitchDeadBranchElimination switch_dead_branch_elimination; SwitchLogicRemovePass switch_logic_remove_pass;
@@ -1667,15 +1944,16 @@ TransposeTransDataPass transpose_transdata_pass; TransOpSymmetryEliminationPass symmetry_elimination_pass; DimensionComputePass dimension_compute_pass; + names_to_passes.emplace_back("EnterPass", &enter_pass); names_to_passes.emplace_back("AddNPass", &addn_pass); names_to_passes.emplace_back("SwitchDeadBranchElimination", &switch_dead_branch_elimination); names_to_passes.emplace_back("SwitchLogicRemovePass", &switch_logic_remove_pass); names_to_passes.emplace_back("MergePass", &merge_pass); names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); + names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass); - names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass);
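PreRun is now split into three phases, each gated on build_mode/build_step. A small hypothetical helper (not in the diff) captures the rule the three checks above repeat; the constants are the real BUILD_* options referenced in the code:

#include <set>
#include <string>

// A phase always runs outside tuning mode; in tuning mode it is skipped
// when the configured build_step appears in that phase's skip set.
static bool RunPhase(const std::string &build_mode, const std::string &build_step,
                     const std::set<std::string> &steps_that_skip_this_phase) {
  return !(build_mode == BUILD_MODE_TUNING && steps_that_skip_this_phase.count(build_step) > 0);
}
// PreRunOptimizeOriginalGraph: skip set {BUILD_STEP_AFTER_UB_MATCH, BUILD_STEP_AFTER_MERGE, BUILD_STEP_AFTER_BUILDER_SUB}
// PreRunOptimizeSubGraph:      skip set {BUILD_STEP_AFTER_MERGE}
// PreRunAfterOptimizeSubGraph: skip set {BUILD_STEP_BEFORE_UB_MATCH, BUILD_STEP_AFTER_BUILDER, BUILD_STEP_AFTER_BUILDER_SUB}

@@ -1975,6 +2253,7 @@ Status 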
GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager Status ret = SUCCESS; GetThreadLocalContext() = ge_context; if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { + SetLocalOmgContext(graph_manager->omg_context_); ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph(); const std::string &engine_name = sub_graph_info_ptr->GetEngineName(); GELOGI("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu",
@@ -2079,6 +2358,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); } + SetLocalOmgContext(graph_manager->omg_context_); + PreRunArgs args; while (graph_manager->thread_run_flag_) { bool pop_status = graph_manager->prerun_args_q_.Pop(args);
@@ -2146,10 +2427,10 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(args.session_id); + RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); if (ret != SUCCESS) { graph_node->SetRunFlag(false); - if (!std::getenv("AnalyzeMode")) { + if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { ReturnError(graph_manager, args.callback, ret, "PreRun Failed, thread exit.."); graph_node->Unlock(); return;
@@ -2176,6 +2457,8 @@ void GraphManager::RunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_Run")) != 0) { GELOGW("Set thread name failed."); } + SetLocalOmgContext(graph_manager->omg_context_); + RunArgs args; while (graph_manager->thread_run_flag_) { bool pop_status = graph_manager->run_args_q_.Pop(args);
@@ -2287,17 +2570,11 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ return; } tensor.length = len * size; - auto pbuff = new (std::nothrow) uint8_t[tensor.length]; - if (!pbuff) { - GELOGE(MEMALLOC_FAILED, "new buff failed!"); - callback(GRAPH_FAILED, outputs); - return; - } + tensor.data.reset(new (std::nothrow) uint8_t[tensor.length]); + if (tensor.data == nullptr) { + GELOGE(MEMALLOC_FAILED, "new buff failed!"); + callback(GRAPH_FAILED, outputs); + return; + } // To avoid global step too small and can not stop, totally set a bigger value for (int64_t i = 0; i < tensor.length; i++) { - *(pbuff + i) = 0x7F; // here stands for a positive max value + tensor.data[i] = 0x7F; // here stands for a positive max value } - tensor.data.reset(pbuff); outputs.emplace_back(std::move(tensor)); } }
@@ -2373,6 +2650,20 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra return ret; } GE_TIMESTAMP_EVENT_END(SetSubgraph, "OptimizeSubgraph::SetSubGraph"); + if ((options_.build_mode == BUILD_MODE_TUNING) && + (options_.build_step == BUILD_STEP_BEFORE_UB_MATCH || options_.build_step == BUILD_STEP_AFTER_BUILDER || + options_.build_step == BUILD_STEP_AFTER_BUILDER_SUB)) { + GE_TIMESTAMP_START(ConvertGraphToFile); + std::string tuning_path; + (void)GetContext().GetOption(TUNING_PATH, tuning_path); + Status ret = ConvertGraphToFile(compute_graph, tuning_path, (options_.build_step == BUILD_STEP_AFTER_BUILDER)); + if (ret != SUCCESS) { + GELOGE(ret, "Convert graph[%s] to file failed", compute_graph->GetName().c_str()); + return ret; + } + GE_TIMESTAMP_EVENT_END(ConvertGraphToFile, "OptimizeSubgraph::ConvertGraphToFile"); + return SUCCESS; + } ComputeGraphPtr merged_compute_graph = nullptr; std::vector<ComputeGraphPtr> merged_sub_graph_list;
@@ -2400,6 +2691,32 @@ Status 
GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra } return SUCCESS; } + +Status GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string path, bool exe_flag) { + GE_CHECK_NOTNULL(compute_graph); + GELOGI("compute_graph [%s] path [%s] Enter ConvertGraphToFile.", compute_graph->GetName().c_str(), path.c_str()); + std::vector<ComputeGraphPtr> non_tuning_subgraphs; + const auto &input_node_sub_graph_map = graph_partitioner_.graph_2_input_subgraph_; + const auto &input_subgraph_info = input_node_sub_graph_map.at(compute_graph); + GE_CHECK_NOTNULL(input_subgraph_info); + ComputeGraphPtr input_graph_tmp = input_subgraph_info->GetSubGraph(); + non_tuning_subgraphs.push_back(input_graph_tmp); + auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + const auto &subgraph_infos = sub_graph_map[compute_graph]; + std::vector<ComputeGraphPtr> tuning_subgraphs; + for (const auto &sub_graph_info_ptr : subgraph_infos) { + GE_CHECK_NOTNULL(sub_graph_info_ptr); + ComputeGraphPtr sub_graph_tmp = sub_graph_info_ptr->GetSubGraph(); + // needs tuning + if (sub_graph_info_ptr->GetEngineName() == kVectorEngine || sub_graph_info_ptr->GetEngineName() == kAIcoreEngine) { + tuning_subgraphs.push_back(sub_graph_tmp); + } else { + non_tuning_subgraphs.push_back(sub_graph_tmp); + } + } + return TuningUtils::ConvertGraphToFile(tuning_subgraphs, non_tuning_subgraphs, exe_flag, path); +} + Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, GeRootModelPtr &ge_root_model, uint64_t session_id) { // build
diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index 6dc83120..9096f4a8 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h
@@ -39,12 +39,13 @@ #include "graph/optimize/graph_optimize.h" #include "graph/partition/graph_partition.h" #include "graph/preprocess/graph_preprocess.h" +#include "graph/tuning_utils.h" #include "model/ge_model.h" namespace ge { class GraphManager { public: - GraphManager(); + explicit GraphManager(OmgContext &omg_context); ~GraphManager() = default;
@@ -248,6 +249,8 @@ class GraphManager { Status MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph); + Status ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string file_path, bool exe_flag = false); + Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph); void SetAttrForHcomBroadCastOp(ge::ComputeGraphPtr &compute_graph);
@@ -304,6 +307,25 @@ class GraphManager { void ChangeConstTypeWhenTraining(const ComputeGraphPtr &compute_graph); + Status PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, + ge::ComputeGraphPtr &compute_graph, uint64_t session_id); + Status PreRunOptimizeSubGraph(const GraphNodePtr &graph_node, ge::ComputeGraphPtr &compute_graph, + uint64_t session_id); + Status PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, + GeRootModelPtr &ge_root_model, uint64_t session_id); + + Status CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs); + + Status OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, Graph2SubGraphInfoList &sub_graph_map, + uint64_t session_id); + + bool CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map); + + Status ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map, + std::unordered_map<std::string, ComputeGraphPtr> &copy_graphs); + Status SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint64_t session_id, uint32_t graph_id); + std::atomic_bool thread_run_flag_; BlockingQueue<PreRunArgs> prerun_args_q_{}; BlockingQueue<RunArgs> run_args_q_{};
@@ -326,6 +348,7 @@ class GraphManager { bool init_flag_; GraphManagerOptions options_; + OmgContext &omg_context_; GraphPrepare graph_preparer_; GraphOptimize graph_optimize_;
diff --git a/src/ge/graph/manager/graph_manager_utils.cc b/src/ge/graph/manager/graph_manager_utils.cc index 90f91c8e..edacadb9 100644 --- a/src/ge/graph/manager/graph_manager_utils.cc +++ b/src/ge/graph/manager/graph_manager_utils.cc
@@ -163,42 +163,4 @@ bool HasCalcOp(const ComputeGraphPtr &graph) { return false; } - -Status ParseOutNodes(const string &out_nodes) { - try { - if (!out_nodes.empty()) { - domi::GetContext().out_nodes_map.clear(); - domi::GetContext().user_out_nodes.clear(); - - vector<string> nodes_v = StringUtils::Split(out_nodes, ';'); - for (const string &node : nodes_v) { - vector<string> key_value_v = StringUtils::Split(node, ':'); - if (key_value_v.size() != 2) { // must contain 2 items - GELOGE(GE_GRAPH_PARAM_NULLPTR, "Invalid outNodes: %s", node.c_str()); - return GE_GRAPH_PARAM_NULLPTR; - } - auto iter = domi::GetContext().out_nodes_map.find(key_value_v[0]); - int32_t index = std::stoi(StringUtils::Trim(key_value_v[1])); - if (iter != domi::GetContext().out_nodes_map.end()) { - iter->second.emplace_back(index); - } else { - std::vector<int32_t> index_v; - index_v.emplace_back(index); - domi::GetContext().out_nodes_map.emplace(key_value_v[0], index_v); - } - domi::GetContext().user_out_nodes.emplace_back(key_value_v[0], index); - } - } - } catch (std::invalid_argument &) { - GELOGE(PARAM_INVALID, "out nodes: %s, key value[1] is invalid argument", out_nodes.c_str()); - return PARAM_INVALID; - } catch (std::out_of_range &) { - GELOGE(PARAM_INVALID, "out nodes: %s, key value[1] is out of range", out_nodes.c_str()); - return PARAM_INVALID; - } catch (...) 
{ - GELOGE(GE_GRAPH_PARAM_NULLPTR, "Invalid outNodes: %s", out_nodes.c_str()); - return GE_GRAPH_PARAM_NULLPTR; - } - return SUCCESS; -} } // namespace ge
diff --git a/src/ge/graph/manager/graph_manager_utils.h b/src/ge/graph/manager/graph_manager_utils.h index 869d4a81..be39df21 100644 --- a/src/ge/graph/manager/graph_manager_utils.h +++ b/src/ge/graph/manager/graph_manager_utils.h
@@ -116,6 +116,7 @@ class SubGraphInfo { using SubGraphInfoPtr = std::shared_ptr<SubGraphInfo>; using Graph2SubGraphInfoList = std::unordered_map<ComputeGraphPtr, std::vector<SubGraphInfoPtr>>; +using Graph2InputNodesSubGraphInfo = std::unordered_map<ComputeGraphPtr, SubGraphInfoPtr>; // for run graph async listener class RunAsyncListener : public ge::ModelListener {
@@ -220,8 +221,6 @@ class GraphModelListener : public ge::ModelListener { std::condition_variable &condition_; }; -Status ParseOutNodes(const string &out_nodes); - struct GraphManagerOptions { int32_t stream_num; int32_t perf_level;
@@ -248,6 +247,8 @@ struct GraphManagerOptions { std::string output_datatype; std::string original_model_file; std::string save_original_model; + std::string build_mode; + std::string build_step; GraphManagerOptions() : stream_num(1), perf_level(domi::GEN_TASK_WITHOUT_FUSION),
@@ -269,7 +270,9 @@ struct GraphManagerOptions { hcom_parallel(false), enable_print_op_pass(true), is_single_op(false), - save_original_model("false") {} + save_original_model("false"), + build_mode(""), + build_step("") {} }; } // namespace ge
diff --git a/src/ge/graph/manager/graph_mem_allocator.cc b/src/ge/graph/manager/graph_mem_allocator.cc index e63039dc..20ca12ae 100644 --- a/src/ge/graph/manager/graph_mem_allocator.cc +++ b/src/ge/graph/manager/graph_mem_allocator.cc
@@ -15,13 +15,13 @@ */ #include "graph/manager/graph_mem_allocator.h" -#include "graph/manager/graph_caching_allocator.h" #include #include -#include #include "framework/common/debug/ge_log.h" +#include "graph/manager/graph_caching_allocator.h" +#include "graph/manager/rdma_pool_allocator.h" namespace ge { void MemoryAllocator::Initialize(uint32_t device_id) {
@@ -185,30 +185,36 @@ Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) { } } - return InitCachingAllocator(memory_type); + if (InitAllocator(memory_type, caching_allocator_map_) != SUCCESS) { + GELOGE(ge::INTERNAL_ERROR, "Create CachingAllocator failed."); + return ge::INTERNAL_ERROR; + } + if (InitAllocator(memory_type, rdma_allocator_map_) != SUCCESS) { + GELOGE(ge::INTERNAL_ERROR, "Create RdmaAllocator failed."); + return ge::INTERNAL_ERROR; + } + return SUCCESS; } -void MemManager::Finalize() noexcept { - GELOGI("Finalize."); - std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); - // caching allocator use memory allocator, so finalize it first - for (auto &caching_allocator : caching_allocator_map_) { - if (caching_allocator.second != nullptr) { - caching_allocator.second->Finalize(); - delete caching_allocator.second; - caching_allocator.second = nullptr; +template <typename T> +void FinalizeAllocatorMap(std::map<rtMemType_t, T *> &allocate_map) { + for (auto &allocator : allocate_map) { + if (allocator.second != nullptr) { + allocator.second->Finalize(); + delete allocator.second; + allocator.second = nullptr; } } - caching_allocator_map_.clear(); + allocate_map.clear(); +} - for (auto &memory_allocator : memory_allocator_map_) { - if (memory_allocator.second != nullptr) { - memory_allocator.second->Finalize(); - delete memory_allocator.second; - memory_allocator.second = nullptr; - } - } - memory_allocator_map_.clear(); +void MemManager::Finalize() noexcept { + GELOGI("Finalize."); + std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); + // caching 
and rdma allocator use memory allocator, so finalize them first + FinalizeAllocatorMap(caching_allocator_map_); + FinalizeAllocatorMap(rdma_allocator_map_); + FinalizeAllocatorMap(memory_allocator_map_); } MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) {
@@ -229,53 +235,11 @@ MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) { return memory_allocator; } -Status MemManager::InitCachingAllocator(const std::vector<rtMemType_t> &memory_type) { - CachingAllocator *caching_allocator = nullptr; - for (unsigned int index : memory_type) { - auto it = caching_allocator_map_.find(index); - if (it == caching_allocator_map_.end()) { - caching_allocator = new (std::nothrow) CachingAllocator(index); - if (caching_allocator != nullptr) { - caching_allocator_map_[index] = caching_allocator; - GELOGI("Create CachingAllocator memory type[%u] success.", index); - } else { - GELOGE(ge::INTERNAL_ERROR, "Alloc CachingAllocator failed."); - } - } else { - caching_allocator = it->second; - } - - if (caching_allocator == nullptr) { - GELOGE(ge::INTERNAL_ERROR, "Create CachingAllocator failed."); - return ge::INTERNAL_ERROR; - } else { - if (caching_allocator->Initialize() != ge::SUCCESS) { - return ge::INTERNAL_ERROR; - } - } - } - return ge::SUCCESS; -} - -CachingAllocator &MemManager::GetCachingAllocator(rtMemType_t memory_type) { - std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); - CachingAllocator *caching_allocator = nullptr; - auto it = caching_allocator_map_.find(memory_type); - if (it != caching_allocator_map_.end()) { - caching_allocator = it->second; - } - - // Usually impossible - if (caching_allocator == nullptr) { - GELOGE(ge::INTERNAL_ERROR, "GetCachingAllocator failed, memory type is %u.", memory_type); - static CachingAllocator default_caching_allocator(RT_MEMORY_RESERVED); - return default_caching_allocator; - ; - } - return *caching_allocator; +CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) { + return Instance().GetAllocator(memory_type, caching_allocator_map_); } -CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) { - return Instance().GetCachingAllocator(memory_type); +RdmaPoolAllocator &MemManager::RdmaPoolInstance(rtMemType_t memory_type) { + return Instance().GetAllocator(memory_type, rdma_allocator_map_); } } // namespace ge
diff --git a/src/ge/graph/manager/graph_mem_allocator.h b/src/ge/graph/manager/graph_mem_allocator.h index e4eeded3..bebdedb6 100644 --- a/src/ge/graph/manager/graph_mem_allocator.h +++ b/src/ge/graph/manager/graph_mem_allocator.h
@@ -24,6 +24,7 @@ #include #include +#include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" #include "runtime/mem.h"
@@ -136,6 +137,7 @@ class MemoryAllocator { using MemoryAllocatorPtr = std::shared_ptr<MemoryAllocator>; class CachingAllocator; +class RdmaPoolAllocator; class MemManager { public:
@@ -143,7 +145,8 @@ class MemManager { virtual ~MemManager(); static MemManager &Instance(); static MemoryAllocator *Instance(rtMemType_t memory_type); - static CachingAllocator &CachingInstance(rtMemType_t memory_type); + CachingAllocator &CachingInstance(rtMemType_t memory_type); + RdmaPoolAllocator &RdmaPoolInstance(rtMemType_t memory_type); MemManager(const MemManager &) = delete; MemManager &operator=(const MemManager &) = delete; ///
@@ -172,22 +175,65 @@ class MemManager { /// /// @ingroup ge_graph - /// @brief ge caching allocator /// @param [in] memory_type memory type - /// @return CachingAllocator ptr + /// @param [in] allocate_map memory allocator map + /// @return Status result of function /// - CachingAllocator &GetCachingAllocator(rtMemType_t memory_type); - + template <typename T> + Status InitAllocator(const std::vector<rtMemType_t> &memory_type, std::map<rtMemType_t, T *> &allocate_map) { + T *allocator = nullptr; + for (unsigned int index : memory_type) { + auto it = allocate_map.find(index); + if (it == allocate_map.end()) { + allocator = new (std::nothrow) T(index); + if (allocator != nullptr) { + allocate_map[index] = allocator; + GELOGI("Create Allocator memory type[%u] success.", index); + } else { + GELOGE(INTERNAL_ERROR, "Alloc Allocator failed."); + } + } else { + allocator = it->second; + } + + if (allocator == nullptr) { + GELOGE(INTERNAL_ERROR, "Create Allocator failed."); + return INTERNAL_ERROR; + } else { + if (allocator->Initialize() != SUCCESS) { + return INTERNAL_ERROR; + } + } + } + return SUCCESS; + } /// /// @ingroup ge_graph - /// @brief ge create caching allocator /// @param [in] memory_type memory type - /// @return Status result of function - /// - Status InitCachingAllocator(const std::vector<rtMemType_t> &memory_type); + /// @param [in] allocate_map memory allocator map + /// @return Allocator ptr + /// + template <typename T> + T &GetAllocator(rtMemType_t memory_type, std::map<rtMemType_t, T *> &allocate_map) { + std::lock_guard<std::recursive_mutex> lock(allocator_mutex_); + T *allocator = nullptr; + auto it = allocate_map.find(memory_type); + if (it != allocate_map.end()) { + allocator = it->second; + } + + // Usually impossible + if (allocator == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "Get allocator failed, memory type is %u.", memory_type); + static T default_allocator(RT_MEMORY_RESERVED); + return default_allocator; + } + return *allocator; + } std::map<rtMemType_t, MemoryAllocator *> memory_allocator_map_; std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_; + std::map<rtMemType_t, RdmaPoolAllocator *> rdma_allocator_map_; std::recursive_mutex allocator_mutex_; }; } // namespace ge
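MemManager now keeps one registry per allocator family, and InitAllocator/GetAllocator template over the concrete allocator type so CachingAllocator and RdmaPoolAllocator share the create-once/lookup logic. A hedged usage sketch follows (InitAllocatorsExample is hypothetical; RT_MEMORY_HBM is just an example memory type, and the RDMA pool still needs a one-time InitMemory(mem_size) before Malloc can succeed):

#include <vector>

Status InitAllocatorsExample() {
  std::vector<rtMemType_t> mem_types = {RT_MEMORY_HBM};
  if (MemManager::Instance().Initialize(mem_types) != SUCCESS) {  // fills all three allocator maps
    return FAILED;
  }
  // Both accessors are instance methods now and route through GetAllocator<T>.
  CachingAllocator &caching = MemManager::Instance().CachingInstance(RT_MEMORY_HBM);
  RdmaPoolAllocator &rdma = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM);
  (void)caching;
  uint8_t *buf = rdma.Malloc(1024);  // device_id defaults to 0
  if (buf != nullptr) {
    (void)rdma.Free(buf, 0);
  }
  return SUCCESS;
}

diff --git a/src/ge/graph/manager/rdma_pool_allocator.cc b/src/ge/graph/manager/rdma_pool_allocator.cc index 1ff77e92..ef82deff 100644 --- a/src/ge/graph/manager/rdma_pool_allocator.cc +++ b/src/ge/graph/manager/rdma_pool_allocator.cc
@@ -15,7 +15,11 @@ */ #include "graph/manager/rdma_pool_allocator.h" + +#include #include "framework/common/debug/ge_log.h" +#include "graph/ge_context.h" +#include "runtime/dev.h" namespace { const size_t kAlignedSize = 512;
@@ -52,31 +56,41 @@ Status RdmaPoolAllocator::Initialize() { return ge::SUCCESS; } void RdmaPoolAllocator::Finalize() { + GELOGD("Rdma pool finalize start."); for (auto it = allocated_blocks_.begin(); it != allocated_blocks_.end();) { auto block = it->second; - allocated_blocks_.erase(it); + it = allocated_blocks_.erase(it); delete block; } for (auto it = block_bin_.begin(); it != block_bin_.end();) { auto block = *it; - block_bin_.erase(it); + it = block_bin_.erase(it); delete block; } if (rdma_base_addr_ != nullptr) { + GELOGD("Start to free rdma pool memory."); if (memory_allocator_->FreeMemory(rdma_base_addr_) != SUCCESS) { GELOGW("Free rdma pool memory failed"); } + rdma_base_addr_ = nullptr; } } -Status RdmaPoolAllocator::InitMemory(size_t mem_size, uint32_t device_id) { +Status RdmaPoolAllocator::InitMemory(size_t mem_size) { + auto device_id = GetContext().DeviceId(); + GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); if (rdma_base_addr_ != nullptr) { GELOGE(GE_MULTI_INIT, "Rdma pool has already been allocated"); return GE_MULTI_INIT; } const std::string purpose = "Memory for rdma pool."; std::lock_guard lock(mutex_); + auto dev_id = static_cast<int32_t>(device_id); + GE_CHK_RT_RET(rtSetDevice(dev_id)); + 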
// DeviceReset before memory finished! + GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(dev_id)); }); + rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); if (rdma_base_addr_ == nullptr) { GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed"); @@ -94,6 +108,7 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size, uint32_t device_id) { } uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { + GELOGI("start to malloc rdma memory size:%zu, device id = %u", size, device_id); auto aligned_size = GetAlignedBlockSize(size); Block key(device_id, aligned_size, nullptr); std::lock_guard lock(mutex_); @@ -107,9 +122,9 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { return nullptr; } allocated_blocks_.emplace(block->ptr, block); - GELOGI("Find block size = %zu", block->size); if (ShouldSplit(block, aligned_size)) { + GELOGD("Block will be split, block size = %zu, aligned_size:%zu", block->size, aligned_size); auto *new_block = new (std::nothrow) Block(device_id, block->size - aligned_size, nullptr, block->ptr + aligned_size); if (new_block == nullptr) { @@ -126,12 +141,14 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { block_bin_.insert(new_block); } + GELOGD("Find block size = %zu", block->size); return block->ptr; } + GELOGW("Memory block not found."); return nullptr; } Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { - GELOGI("Free device id = %u", device_id); + GELOGI("Free rdma memory, device id = %u", device_id); if (memory_addr == nullptr) { GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); return GE_GRAPH_FREE_FAILED; @@ -143,27 +160,41 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { GELOGE(PARAM_INVALID, "Invalid memory pointer"); return PARAM_INVALID; } + Block *block = it->second; block->allocated = false; allocated_blocks_.erase(it); + + Block *merge_blocks[] = {block->prev, block->next}; + for (Block *merge_block : merge_blocks) { + MergeBlocks(block, merge_block); + } block_bin_.insert(block); - // Each time merge with its pre and next.
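// A minimal standalone sketch of the two-direction coalescing introduced above,
// with a simplified Block (assumed fields ptr/size/prev/next/allocated); this is
// an illustration of the technique, not part of the patch:
//
//   void Merge(Block *dst, Block *src) {
//     if (src == nullptr || src->allocated) return;  // only free neighbours merge
//     if (dst->prev == src) {                        // absorb the left neighbour
//       dst->ptr = src->ptr;
//       dst->prev = src->prev;
//       if (dst->prev != nullptr) dst->prev->next = dst;
//     } else {                                       // absorb the right neighbour
//       dst->next = src->next;
//       if (dst->next != nullptr) dst->next->prev = dst;
//     }
//     dst->size += src->size;
//     delete src;
//   }
//
// Because the freed block itself stays as dst, Free() can merge both sides with
// a single loop over {block->prev, block->next}, as the patch does.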
- MergeBlockNearby(block, block->next); - MergeBlockNearby(block->prev, block); + return SUCCESS; } -void RdmaPoolAllocator::MergeBlockNearby(Block *pre_block, Block *block) { - if (!(CanMerge(pre_block) && CanMerge(block))) { +void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { + if (!CanMerge(dst) || !CanMerge(src)) { return; } - pre_block->size += block->size; - pre_block->next = block->next; - if (block->next != nullptr) { - block->next->prev = pre_block; + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev != nullptr) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next != nullptr) { + dst->next->prev = dst; + } } - block_bin_.erase(block); - delete block; + + dst->size += src->size; + block_bin_.erase(src); + delete src; } Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { diff --git a/src/ge/graph/manager/rdma_pool_allocator.h b/src/ge/graph/manager/rdma_pool_allocator.h index e1da29a9..4d8cf71e 100644 --- a/src/ge/graph/manager/rdma_pool_allocator.h +++ b/src/ge/graph/manager/rdma_pool_allocator.h @@ -40,12 +40,12 @@ class RdmaPoolAllocator { RdmaPoolAllocator &operator=(const RdmaPoolAllocator &) = delete; - ~RdmaPoolAllocator() { Finalize(); } + ~RdmaPoolAllocator() = default; Status Initialize(); void Finalize(); - Status InitMemory(size_t mem_size, uint32_t device_id = 0); + Status InitMemory(size_t mem_size); uint8_t *Malloc(size_t size, uint32_t device_id = 0); @@ -54,7 +54,7 @@ class RdmaPoolAllocator { Status GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size); private: - void MergeBlockNearby(Block *pre_block, Block *block); + void MergeBlocks(Block *dst, Block *src); rtMemType_t memory_type_; size_t rdma_mem_size_ = 0; // Total rdma memory size to be allocated. 
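The memory-manager header change earlier in this patch replaces the caching-allocator-specific helpers with templated InitAllocator/GetAllocator, so caching and RDMA allocators share one create-once registry per memory type. A minimal sketch of that registry pattern, assuming the maps are keyed by the runtime memory type (simplified stand-in names; illustration only):

#include <map>
#include <mutex>

using MemType = unsigned int;  // stand-in for rtMemType_t

// T is assumed to be constructible from a MemType and to provide Initialize()
// returning 0 on success.
template <typename T>
T *GetOrCreateAllocator(MemType type, std::map<MemType, T *> &registry, std::recursive_mutex &mu) {
  std::lock_guard<std::recursive_mutex> lock(mu);
  auto it = registry.find(type);
  if (it == registry.end()) {
    auto *allocator = new (std::nothrow) T(type);  // one allocator instance per memory type
    if (allocator == nullptr || allocator->Initialize() != 0) {
      delete allocator;
      return nullptr;
    }
    it = registry.emplace(type, allocator).first;
  }
  return it->second;
}

Each allocator family (memory, caching, RDMA) keeps its own map member, and the template keeps the create-or-look-up logic in a single place.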
diff --git a/src/ge/graph/manager/util/hcom_util.cc b/src/ge/graph/manager/util/hcom_util.cc index 5f31c982..614f8527 100644 --- a/src/ge/graph/manager/util/hcom_util.cc +++ b/src/ge/graph/manager/util/hcom_util.cc @@ -63,7 +63,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, return SUCCESS; } -Status HcomOmeUtil::GetHcclTypeSize(hcclDataType_t data_type, int32_t &size) { +Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { auto iter = kConstOpHcclDataTypeSize.find(data_type); GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, "HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!"); @@ -72,7 +72,7 @@ Status HcomOmeUtil::GetHcclTypeSize(hcclDataType_t data_type, int32_t &size) { return SUCCESS; } -Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather, +Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType data_type, bool is_allgather, int &count) { GE_CHECK_NOTNULL(op_desc); if (!IsHCOMOp(op_desc->GetType())) { @@ -149,7 +149,7 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, int64_t align_size = 512; int32_t size = 0; for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<hcclDataType_t>(kernel_hccl_infos[i].dataType), size), + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), "GetHorovodCount: GetHcclTypeSize fail!"); int64_t input_size = 0; int64_t block_size = 0; @@ -187,7 +187,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); if (IsHCOMOp(op_desc->GetType())) { int32_t count = 0; - ret = GetHcomCount(op_desc, static_cast<hcclDataType_t>(kernel_hccl_infos[0].dataType), + ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); if (ret != SUCCESS) { GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.", @@ -209,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, return SUCCESS; } -Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type) { +Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, HcclReduceOp &op_type) { GE_CHECK_NOTNULL(op_desc); if (IsHCOMOp(op_desc->GetType())) { @@ -219,13 +219,13 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hccl op_desc->GetName().c_str(), op_desc->GetType().c_str()); if (hcom_op_type == "min") { - op_type = HCCL_REP_OP_MIN; + op_type = HCCL_REDUCE_MIN; } else if (hcom_op_type == "max") { - op_type = HCCL_REP_OP_MAX; + op_type = HCCL_REDUCE_MAX; } else if (hcom_op_type == "prod") { - op_type = HCCL_REP_OP_PROD; + op_type = HCCL_REDUCE_PROD; } else if (hcom_op_type == "sum") { - op_type = HCCL_REP_OP_SUM; + op_type = HCCL_REDUCE_SUM; } else { GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); return PARAM_INVALID; @@ -239,7 +239,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hccl "HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", op_desc->GetName().c_str(), op_desc->GetType().c_str()); - auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<horovodRedOp_t>(horovod_op_type)); + auto iter = 
kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); if (iter == kHorovodRedOpToHcclRedOp.end()) { GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType can't support! Current HcomOpType : %ld", op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); diff --git a/src/ge/graph/manager/util/hcom_util.h b/src/ge/graph/manager/util/hcom_util.h index e31e3ef0..064058f8 100644 --- a/src/ge/graph/manager/util/hcom_util.h +++ b/src/ge/graph/manager/util/hcom_util.h @@ -34,24 +34,24 @@ namespace ge { using std::string; using std::vector; -static std::map<ge::DataType, hcclDataType_t> kConstOpHcclDataType = { - {ge::DT_FLOAT, HCCL_DATA_TYPE_FLOAT}, - {ge::DT_FLOAT16, HCCL_DATA_TYPE_HALF}, +static std::map<ge::DataType, HcclDataType> kConstOpHcclDataType = { + {ge::DT_FLOAT, HCCL_DATA_TYPE_FP32}, + {ge::DT_FLOAT16, HCCL_DATA_TYPE_FP16}, {ge::DT_INT8, HCCL_DATA_TYPE_INT8}, - {ge::DT_INT32, HCCL_DATA_TYPE_INT}, + {ge::DT_INT32, HCCL_DATA_TYPE_INT32}, }; -static std::map<hcclDataType_t, int32_t> kConstOpHcclDataTypeSize = { - {HCCL_DATA_TYPE_FLOAT, sizeof(float)}, - {HCCL_DATA_TYPE_HALF, sizeof(float) / 2}, +static std::map<HcclDataType, int32_t> kConstOpHcclDataTypeSize = { + {HCCL_DATA_TYPE_FP32, sizeof(float)}, + {HCCL_DATA_TYPE_FP16, sizeof(float) / 2}, {HCCL_DATA_TYPE_INT8, sizeof(int8_t)}, - {HCCL_DATA_TYPE_INT, sizeof(int32_t)}, + {HCCL_DATA_TYPE_INT32, sizeof(int32_t)}, }; -static std::map<horovodRedOp_t, hcclRedOp_t> kHorovodRedOpToHcclRedOp = { - {HOROVOD_REP_OP_SUM, HCCL_REP_OP_SUM}, {HOROVOD_REP_OP_MIN, HCCL_REP_OP_MIN}, - {HOROVOD_REP_OP_MAX, HCCL_REP_OP_MAX}, {HOROVOD_REP_OP_PROD, HCCL_REP_OP_PROD}, - {HOROVOD_REP_OP_RESERVED, HCCL_REP_OP_RESERVED}, +static std::map<HorovodReduceOp, HcclReduceOp> kHorovodRedOpToHcclRedOp = { + {HOROVOD_REDUCE_SUM, HCCL_REDUCE_SUM}, {HOROVOD_REDUCE_MIN, HCCL_REDUCE_MIN}, + {HOROVOD_REDUCE_MAX, HCCL_REDUCE_MAX}, {HOROVOD_REDUCE_PROD, HCCL_REDUCE_PROD}, + {HOROVOD_REDUCE_RESERVED, HCCL_REDUCE_RESERVED}, }; class HcomOmeUtil { @@ -71,7 +71,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcclTypeSize(hcclDataType_t data_type, int32_t &size); + static Status GetHcclTypeSize(HcclDataType data_type, int32_t &size); /// /// @ingroup domi_ome @@ -87,7 +87,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type); + static Status GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, HcclReduceOp &op_type); /// /// @ingroup domi_ome @@ -150,8 +150,7 @@ class HcomOmeUtil { /// @return SUCCESS /// @return FAIL /// - static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather, - int &count); + static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType data_type, bool is_allgather, int &count); private: /// diff --git a/src/ge/graph/manager/util/rt_context_util.cc b/src/ge/graph/manager/util/rt_context_util.cc index 63f217a9..75b25740 100644 --- a/src/ge/graph/manager/util/rt_context_util.cc +++ b/src/ge/graph/manager/util/rt_context_util.cc @@ -19,31 +19,57 @@ #include "framework/common/debug/ge_log.h" namespace ge { +namespace { +const int64_t kDefaultGraphId = -1; +} + void RtContextUtil::AddRtContext(uint64_t session_id, rtContext_t context) { std::lock_guard<std::mutex> lock(ctx_mutex_); - rt_contexts_[session_id].emplace_back(context); + rt_contexts_[session_id][kDefaultGraphId].emplace_back(context); +} + +void RtContextUtil::AddRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t context) { + std::lock_guard<std::mutex> lock(ctx_mutex_); + 
rt_contexts_[session_id][static_cast<int64_t>(graph_id)].emplace_back(context); } void RtContextUtil::DestroyRtContexts(uint64_t session_id) { std::lock_guard<std::mutex> lock(ctx_mutex_); - auto &contexts = rt_contexts_[session_id]; - DestroyRtContexts(session_id, contexts); + auto &session_ctxs = rt_contexts_[session_id]; + for (auto &graph_ctx_pair : session_ctxs) { + DestroyRtContexts(session_id, graph_ctx_pair.first, graph_ctx_pair.second); + } + auto iter = rt_contexts_.find(session_id); if (iter != rt_contexts_.end()) { rt_contexts_.erase(iter); } } +void RtContextUtil::DestroyRtContexts(uint64_t session_id, uint32_t graph_id) { + std::lock_guard<std::mutex> lock(ctx_mutex_); + auto &session_ctxs = rt_contexts_[session_id]; + auto &graph_ctxs = session_ctxs[graph_id]; + DestroyRtContexts(session_id, static_cast<int64_t>(graph_id), graph_ctxs); + + auto iter = session_ctxs.find(graph_id); + if (iter != session_ctxs.end()) { + session_ctxs.erase(iter); + } +} + void RtContextUtil::DestroyAllRtContexts() { std::lock_guard<std::mutex> lock(ctx_mutex_); - for (auto &ctx_pair : rt_contexts_) { - DestroyRtContexts(ctx_pair.first, ctx_pair.second); + for (auto &session_ctx_pair : rt_contexts_) { + for (auto &graph_ctx_pair : session_ctx_pair.second) { + DestroyRtContexts(session_ctx_pair.first, graph_ctx_pair.first, graph_ctx_pair.second); + } } rt_contexts_.clear(); } -void RtContextUtil::DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts) { - GELOGI("Runtime context handle number of session %lu is %zu.", session_id, contexts.size()); +void RtContextUtil::DestroyRtContexts(uint64_t session_id, int64_t graph_id, std::vector<rtContext_t> &contexts) { + GELOGI("Destroy %zu rt contexts for graph %ld of session %lu.", contexts.size(), graph_id, session_id); for (auto &rtContext : contexts) { (void)rtCtxDestroy(rtContext); } diff --git a/src/ge/graph/manager/util/rt_context_util.h b/src/ge/graph/manager/util/rt_context_util.h index 58cc0803..50f0fbed 100644 --- a/src/ge/graph/manager/util/rt_context_util.h +++ b/src/ge/graph/manager/util/rt_context_util.h @@ -32,12 +32,9 @@ class RtContextUtil { } void AddRtContext(uint64_t session_id, rtContext_t context); - - const rtContext_t GetNormalModeContext() const { return before_prerun_ctx_; } - - void SetNormalModeContext(rtContext_t context) { before_prerun_ctx_ = context; } - + void AddRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t context); void DestroyRtContexts(uint64_t session_id); + void DestroyRtContexts(uint64_t session_id, uint32_t graph_id); void DestroyAllRtContexts(); RtContextUtil &operator=(const RtContextUtil &) = delete; @@ -47,11 +44,9 @@ class RtContextUtil { RtContextUtil() = default; ~RtContextUtil() {} - void DestroyRtContexts(uint64_t session_id, std::vector<rtContext_t> &contexts); - - std::map<uint64_t, std::vector<rtContext_t>> rt_contexts_; - rtContext_t before_prerun_ctx_ = nullptr; + void DestroyRtContexts(uint64_t session_id, int64_t graph_id, std::vector<rtContext_t> &contexts); + std::map<uint64_t, std::map<int64_t, std::vector<rtContext_t>>> rt_contexts_; std::mutex ctx_mutex_; }; } // namespace ge diff --git a/src/ge/graph/optimize/graph_optimize.cc b/src/ge/graph/optimize/graph_optimize.cc index a8de6701..214f68eb 100644 --- a/src/ge/graph/optimize/graph_optimize.cc +++ b/src/ge/graph/optimize/graph_optimize.cc @@ -17,6 +17,7 @@ #include "graph/optimize/graph_optimize.h" #include "graph/ge_context.h" +#include "graph/common/local_context.h" #include "graph/passes/dimension_adjust_pass.h" #include "inc/pass_manager.h" #include "init/gelib.h" @@ -68,7 +69,7 @@ void AddNodeInputProperty(ComputeGraphPtr &compute_graph) {
src_index_list.emplace_back(peer_out_anchor->GetIdx()); node_op_desc->SetSrcName(src_name_list); node_op_desc->SetSrcIndex(src_index_list); - GE_IF_BOOL_EXEC(!(node_op_desc->GetType() == NETOUTPUT && domi::GetContext().type == domi::TENSORFLOW), + GE_IF_BOOL_EXEC(!(node_op_desc->GetType() == NETOUTPUT && GetLocalOmgContext().type == domi::TENSORFLOW), ge::NodePtr peer_owner_node = peer_out_anchor->GetOwnerNode(); input_name_list.emplace_back( peer_owner_node->GetName() + @@ -102,6 +103,17 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std return SUCCESS; } + if (build_mode_ == BUILD_MODE_TUNING && build_step_ == BUILD_STEP_AFTER_UB_MATCH) { + for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { + Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph)); + if (ret != SUCCESS) { + GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphStage2]: graph optimize failed, ret:%d", ret); + return ret; + } + } + return SUCCESS; + } + for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { ret = (*iter)->OptimizeFusedGraph(*(compute_graph)); if (ret != SUCCESS) { @@ -264,6 +276,8 @@ Status GraphOptimize::SetOptions(const ge::GraphManagerOptions &options) { local_fmk_op_flag_ = options.local_fmk_op_flag; func_bin_path_ = options.func_bin_path; core_type_ = options.core_type; + build_mode_ = options.build_mode; + build_step_ = options.build_step; return SUCCESS; } diff --git a/src/ge/graph/optimize/graph_optimize.h b/src/ge/graph/optimize/graph_optimize.h index 0bbeb0f7..3d2db782 100644 --- a/src/ge/graph/optimize/graph_optimize.h +++ b/src/ge/graph/optimize/graph_optimize.h @@ -89,6 +89,8 @@ class GraphOptimize { // record the summary names for filtering summary results. std::map> summary_output_indexes_ = {}; std::string func_bin_path_; + std::string build_mode_; + std::string build_step_; }; } // namespace ge #endif // GE_GRAPH_OPTIMIZE_GRAPH_OPTIMIZE_H_ diff --git a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc index 3ecc201a..9c166f4d 100644 --- a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc +++ b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc @@ -136,7 +136,7 @@ NodePtr CreateIdentityAfterSrcNode(const Node &src_node, int out_anchor_idx) { if (src_node.GetOpDesc() == nullptr) { return nullptr; } - static std::atomic identity_num(0); + static std::atomic_long identity_num(0); auto next_num = identity_num.fetch_add(1); // 1.
create new identity op desc string identity_name = src_node.GetName() + "_" + IDENTITY + std::to_string(next_num); @@ -541,9 +541,8 @@ Status SplitIdentity(const NodePtr &node) { GE_CHECK_NOTNULL(pre_out_data_anchor); auto pre_node = pre_out_data_anchor->GetOwnerNode(); GE_CHECK_NOTNULL(pre_node); - Status ret = SUCCESS; for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - ret = SplitIdentityAlongAnchor(out_data_anchor, peer_in_data_anchor, pre_out_data_anchor, pre_node); + Status ret = SplitIdentityAlongAnchor(out_data_anchor, peer_in_data_anchor, pre_out_data_anchor, pre_node); if (ret != SUCCESS) { GELOGE(ret, "Split identity node along anchor failed."); return ret; @@ -551,7 +550,7 @@ Status SplitIdentity(const NodePtr &node) { } // 2.isolate Identity node with no data output if (node->GetOutDataNodesSize() == 0) { - ret = GraphUtils::IsolateNode(node, {}); + Status ret = GraphUtils::IsolateNode(node, {}); if (ret != SUCCESS) { GELOGE(FAILED, "IsolateAndDelete identity node %s.", node->GetName().c_str()); return FAILED; diff --git a/src/ge/graph/partition/dynamic_shape_partition.cc b/src/ge/graph/partition/dynamic_shape_partition.cc index e5a33b37..9cc7d0f4 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.cc +++ b/src/ge/graph/partition/dynamic_shape_partition.cc @@ -43,18 +43,13 @@ #define REQUIRE_SUCCESS(cond, ...) REQUIRE(((cond) == SUCCESS), __VA_ARGS__) #define REQUIRE_GRAPH_SUCCESS(cond, ...) REQUIRE(((cond) == GRAPH_SUCCESS), __VA_ARGS__) -bool IsExperimental() { - const static bool kIsExperimental = (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") != nullptr); - return kIsExperimental; -} - namespace ge { using Cluster = DynamicShapePartitioner::Cluster; using ClusterPtr = std::shared_ptr<Cluster>; Status DynamicShapePartitioner::Partition() { REQUIRE_NOT_NULL(root_graph_, "Graph is nullptr."); - if (!IsExperimental()) { + if (!GraphUtils::IsUnknownShapeGraph(root_graph_)) { GELOGD("Skip dynamic shape partition as not in experimental mode."); REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false), "Failed set dynamic shape partitioned flag on root graph."); @@ -872,7 +867,8 @@ void Cluster::Clear() { control_outputs_.clear(); partition_node_.reset(); subgraph_.reset(); + unique_id_ = 0; } -size_t Cluster::unique_id_ = 0; +thread_local size_t Cluster::unique_id_ = 0; } // namespace ge diff --git a/src/ge/graph/partition/dynamic_shape_partition.h b/src/ge/graph/partition/dynamic_shape_partition.h index b851a084..06a94833 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.h +++ b/src/ge/graph/partition/dynamic_shape_partition.h @@ -81,7 +81,7 @@ class DynamicShapePartitioner { void Clear(); private: - static size_t unique_id_; + static thread_local size_t unique_id_; size_t id_; // Each Cluster records the maximum and minimum topological order of its node size_t min_; // maximum topological order diff --git a/src/ge/graph/partition/engine_place.cc b/src/ge/graph/partition/engine_place.cc index 2d1a7f13..ba651c88 100644 --- a/src/ge/graph/partition/engine_place.cc +++ b/src/ge/graph/partition/engine_place.cc @@ -15,19 +15,25 @@ */ #include "graph/partition/engine_place.h" + #include #include #include #include +#include + #include "common/op/ge_op_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" #include "init/gelib.h" #include "opskernel_manager/ops_kernel_manager.h" +#include "analyzer/analyzer.h" namespace ge { -Status EnginePlacer::Run() { - GELOGI("Engine placer starts.");
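// Shape of the reworked engine placement below, in simplified form (assumed
// helper name PickEngine; illustration only, not part of the patch):
//
//   Status EnginePlacer::Run() {
//     std::lock_guard<std::mutex> lock(check_support_cost_mutex);  // stats are shared
//     if (Check() != SUCCESS) {                  // null graph / GE not initialized
//       return FAILED;
//     }
//     bool all_supported = true;
//     for (const auto &node : compute_graph_->GetDirectNode()) {
//       std::string engine = PickEngine(node);   // LX attrs first, cost model second
//       if (engine.empty()) {                    // record the miss but keep scanning,
//         all_supported = false;                 // so every unsupported op gets logged
//         continue;
//       }
//       if (AssignEngineAndLog(node, engine) != SUCCESS) return FAILED;
//     }
//     return all_supported ? SUCCESS : FAILED;
//   }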
+namespace { +std::mutex check_support_cost_mutex; +} +Status EnginePlacer::Check() const { if (compute_graph_ == nullptr) { GELOGE(GE_GRAPH_NULL_INPUT, "compute_graph_ is null."); return FAILED; @@ -37,23 +43,48 @@ Status EnginePlacer::Run() { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Run enginePlacer failed"); return FAILED; } + return SUCCESS; +} + +Status EnginePlacer::Run() { + std::lock_guard<std::mutex> lock(check_support_cost_mutex); + + GELOGI("Engine placer starts."); + if (Check() != SUCCESS) { + return FAILED; + } + bool is_check_support_success = true; // Assign engine for each node in the graph - instance_ptr->DNNEngineManagerObj().InitPerformanceStaistic(); + ge::GELib::GetInstance()->DNNEngineManagerObj().InitPerformanceStaistic(); for (const auto &node_ptr : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(node_ptr); - GE_CHECK_NOTNULL(node_ptr->GetOpDesc()); + auto op_desc = node_ptr->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); std::string engine_name; + std::string kernel_name; // Check if this node has assigned engine - if ((!node_ptr->GetOpDesc()->GetOpKernelLibName().empty())) { - engine_name = node_ptr->GetOpDesc()->GetOpEngineName(); + bool has_engine_attr = + AttrUtils::GetStr(op_desc, ATTR_NAME_ENGINE_NAME_FOR_LX, engine_name) && !engine_name.empty(); + bool has_kernel_attr = + AttrUtils::GetStr(op_desc, ATTR_NAME_KKERNEL_LIB_NAME_FOR_LX, kernel_name) && !kernel_name.empty(); + bool use_exist_engine_name = !op_desc->GetOpKernelLibName().empty() || (has_kernel_attr && has_engine_attr); + if (use_exist_engine_name) { + if (op_desc->GetOpEngineName().empty()) { + GELOGI("Op %s set engine_name %s kernel_name %s from attrs", op_desc->GetName().c_str(), engine_name.c_str(), + kernel_name.c_str()); + op_desc->SetOpEngineName(engine_name); + op_desc->SetOpKernelLibName(kernel_name); + } + engine_name = op_desc->GetOpEngineName(); } else { // Call placer cost model to get the "best" engine for this node - engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr->GetOpDesc()); - // If can't get op's engine name, return failed + engine_name = ge::GELib::GetInstance()->DNNEngineManagerObj().GetDNNEngineName(node_ptr); + // If the engine name cannot be got, finish checking the remaining nodes before returning failed if (engine_name.empty()) { + is_check_support_success = false; GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s", node_ptr->GetOpDesc()->GetType().c_str()); - return FAILED; + continue; } } if (AssignEngineAndLog(node_ptr, engine_name) != SUCCESS) { @@ -61,11 +92,12 @@ Status EnginePlacer::Run() { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Assign engine failed"); return FAILED; } } - for (auto &it : instance_ptr->DNNEngineManagerObj().GetCheckSupportCost()) { + + for (auto &it : ge::GELib::GetInstance()->DNNEngineManagerObj().GetCheckSupportCost()) { GEEVENT("The time cost of %s::CheckSupported is [%lu] micro second.", it.first.c_str(), it.second); } GELOGI("Engine placer ends."); - return SUCCESS; + return is_check_support_success ? 
SUCCESS : FAILED; } Status EnginePlacer::AssignEngineAndLog(ge::ConstNodePtr node_ptr, const std::string &engine_name) { diff --git a/src/ge/graph/partition/engine_place.h b/src/ge/graph/partition/engine_place.h index 8a3e83a5..1672df0d 100644 --- a/src/ge/graph/partition/engine_place.h +++ b/src/ge/graph/partition/engine_place.h @@ -46,6 +46,7 @@ class EnginePlacer { private: Status AssignEngineAndLog(ConstNodePtr node_ptr, const std::string &engine_name); + Status Check() const; ComputeGraphPtr compute_graph_; NodeEngineMap node_engine_map_; diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index 15f298c0..b280074e 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -362,13 +362,18 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr } GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", graph_info_.num_of_pld_end_), GELOGW("SetInt peerIndex failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_peerNodeName", new_end_node->GetName()), + GELOGW("SetStr _peerNodeName failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentOpType", src_node->GetType()), GELOGW("SetStr parentOpType failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_parentNodeName", src_node->GetName()), + GELOGW("SetStr _parentNodeName failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentId", end_graph->GetName() + ":" + std::to_string(node_id)), GELOGW("SetStr parentId failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)), GELOGW("SetInt anchorIndex failed");) GE_IF_BOOL_EXEC(!pld_op_desc->SetExtAttr("parentNode", src_node), GELOGW("SetPldExtAttr parentNode failed");) + OpDescPtr src_node_op_desc = src_node->GetOpDesc(); GE_CHECK_NOTNULL(src_node_op_desc); GE_IF_BOOL_EXEC( @@ -530,6 +535,10 @@ Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) { ClusterPtr cluster = MakeShared<Cluster>(temp_index, kEngineDefaultData, temp_stream); new_cluster = cluster; } else { + if (node_engine_map->count(node) == 0) { + GELOGE(FAILED, "node[%s] does not own an engine!", node->GetName().c_str()); + return FAILED; + } ClusterPtr cluster = MakeShared<Cluster>(temp_index, node_engine_map->at(node), temp_stream); new_cluster = cluster; } @@ -577,32 +586,33 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vector<ge::SubGraphInfoPtr> &output_subgraphs, sub_graph->SetParentNode(compute_graph->GetParentNode()); (void)AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName()); + auto sgi = MakeShared<SubGraphInfo>(); + if (sgi == nullptr) { + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); + return FAILED; + } + // set engine name + sgi->SetEngineName(engine_name); + // set stream label + string sub_graph_stream; + if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { + sgi->SetStreamLabel(sub_graph_stream); + } + /// for now inputFlag is the same before and after partition.
It should + /// be changed according to the real partition + std::vector<bool> sub_graph_input(graph_info_.input_size_, true); + std::vector<bool> sub_graph_output(graph_info_.output_size_, true); + sgi->SetSubGraph(sub_graph); + sgi->SetOutputFlag(sub_graph_output); + sgi->SetInputFlag(sub_graph_input); + sgi->SetOutputContext(graph_info_.output_name_); + AddEndPldInformationToSubGraphInfo(sgi); + GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(), + sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str()); if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo - auto sgi = MakeShared<SubGraphInfo>(); - if (sgi == nullptr) { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); - return FAILED; - } - // set engine name - sgi->SetEngineName(engine_name); - // set stream label - string sub_graph_stream; - if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { - sgi->SetStreamLabel(sub_graph_stream); - } - /// for now inputFlag is the same before and after partition. It should - /// be changed according to the real partition - std::vector<bool> sub_graph_input(graph_info_.input_size_, true); - std::vector<bool> sub_graph_output(graph_info_.output_size_, true); - sgi->SetSubGraph(sub_graph); - sgi->SetOutputFlag(sub_graph_output); - sgi->SetInputFlag(sub_graph_input); - sgi->SetOutputContext(graph_info_.output_name_); - AddEndPldInformationToSubGraphInfo(sgi); - GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", - engine_name.c_str(), sub_graph->GetName().c_str(), - sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str()); output_subgraphs.push_back(sgi); + } else { + graph_2_input_subgraph_[compute_graph] = sgi; } } return SUCCESS; diff --git a/src/ge/graph/partition/graph_partition.h b/src/ge/graph/partition/graph_partition.h index 26592359..a363bd9d 100644 --- a/src/ge/graph/partition/graph_partition.h +++ b/src/ge/graph/partition/graph_partition.h @@ -173,8 +173,10 @@ class GraphPartitioner { }; std::unordered_map graph_2_graph_partition_info_; Graph2SubGraphInfoList graph_2_subgraph_list_; + Graph2InputNodesSubGraphInfo graph_2_input_subgraph_; GraphPartitionInfo graph_info_; uint32_t partition_times_; // times of call partition + friend class GraphManager; }; } // namespace ge diff --git a/src/ge/graph/passes/common_subexpression_elimination_pass.cc b/src/ge/graph/passes/common_subexpression_elimination_pass.cc index 18f2e857..4415d144 100644 --- a/src/ge/graph/passes/common_subexpression_elimination_pass.cc +++ b/src/ge/graph/passes/common_subexpression_elimination_pass.cc @@ -20,6 +20,7 @@ #include #include +#include "common/base64.h" #include "graph/utils/node_utils.h" #include "ge_local_engine/engine/host_cpu_engine.h" #include "graph/passes/folding_pass.h" @@ -83,7 +84,7 @@ Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { continue; } auto key = GetCseKey(node); - GELOGD("The node %s cse key %s", node->GetName().c_str(), key.c_str()); + GELOGD("The node %s cse key %s", node->GetName().c_str(), ge::base64::EncodeToBase64(key).c_str()); auto iter = keys_to_node.find(key); if (iter == keys_to_node.end()) { keys_to_node[key] = node; diff --git a/src/ge/graph/passes/compile_nodes_pass.cc b/src/ge/graph/passes/compile_nodes_pass.cc index 330569a2..a93671c7 100644 --- a/src/ge/graph/passes/compile_nodes_pass.cc
+++ b/src/ge/graph/passes/compile_nodes_pass.cc @@ -93,7 +93,7 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: // reset op kernel lib, find supported kernel kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { - (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { GELOGE(GRAPH_FAILED, "Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), diff --git a/src/ge/graph/passes/cond_pass.cc b/src/ge/graph/passes/cond_pass.cc index 03ca9009..c3a421b1 100644 --- a/src/ge/graph/passes/cond_pass.cc +++ b/src/ge/graph/passes/cond_pass.cc @@ -227,7 +227,7 @@ Status CondPass::HandleScalarCond(const ComputeGraphPtr &graph, const OutDataAnc GELOGI("Handle cond with scalar cond-input."); GeTensorDesc tensor = out_anchor->GetOwnerNode()->GetOpDesc()->GetOutputDesc(out_anchor->GetIdx()); - std::string cast_name = out_anchor->GetOwnerNode()->GetName() + "_Cast"; + std::string cast_name = in_anchor->GetOwnerNode()->GetName() + "_Cast"; NodePtr cast_node = AddCastNode(graph, cast_name, tensor, src_type, DT_INT32); if (cast_node == nullptr) { GELOGE(FAILED, "Add Cast node failed, name:%s.", cast_name.c_str()); @@ -266,7 +266,7 @@ Status CondPass::InsertNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr out_tensor.SetShape(in_tensor.GetShape()); out_tensor.SetOriginShape(in_tensor.GetOriginShape()); - OpDescBuilder op_desc_builder(out_anchor->GetOwnerNode()->GetName() + "_" + type, type); + OpDescBuilder op_desc_builder(in_anchor->GetOwnerNode()->GetName() + "_" + type, type); OpDescPtr op_desc = op_desc_builder.AddInput("x", in_tensor).AddOutput("y", out_tensor).Build(); if (op_desc == nullptr) { GELOGE(FAILED, "Create op_desc failed."); diff --git a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc index 9454c00d..6c426e95 100644 --- a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc +++ b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc @@ -20,6 +20,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/util.h" #include "graph/utils/graph_utils.h" +#include "graph/debug/ge_attr_define.h" namespace ge { /* Pass Explanation: @@ -42,6 +43,12 @@ Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) { GELOGD("CtrlEdgeTransferPass start running"); GE_CHECK_NOTNULL(graph); + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + if (!is_dynamic_shape) { + return SUCCESS; + } + for (ge::NodePtr &n : graph->GetDirectNode()) { auto op_desc = n->GetOpDesc(); if (op_desc == nullptr) { diff --git a/src/ge/graph/passes/end_of_sequence_add_control_pass.cc b/src/ge/graph/passes/end_of_sequence_add_control_pass.cc index a3928835..90c0841c 100644 --- a/src/ge/graph/passes/end_of_sequence_add_control_pass.cc +++ b/src/ge/graph/passes/end_of_sequence_add_control_pass.cc @@ -112,7 +112,7 @@ bool EndOfSequenceAddControlPass::IsDataLikeNode(const NodePtr &node) { } string engine_name = op_desc->GetOpEngineName(); if (engine_name.empty()) { - engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node->GetOpDesc()); + engine_name = instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node); } const map schedulers = instance_ptr->DNNEngineManagerObj().GetSchedulers(); // Only one scheduler has been supported by now diff --git 
a/src/ge/graph/passes/folding_pass.cc b/src/ge/graph/passes/folding_pass.cc index 8281db5d..b52a3226 100644 --- a/src/ge/graph/passes/folding_pass.cc +++ b/src/ge/graph/passes/folding_pass.cc @@ -142,8 +142,8 @@ Status FoldingPass::Folding(NodePtr &node, vector &outputs) { for (auto iter = in_data_nodes_set.begin(); iter != in_data_nodes_set.end(); ++iter) { auto pre_node = *iter; if (pre_node->GetOutDataNodesSize() == 0) { - if (pre_node->GetType() == DATA) { - GELOGI("No need to remove data, node name:%s.", pre_node->GetName().c_str()); + if ((pre_node->GetType() == DATA) || (pre_node->GetType() == ENTER)) { + GELOGI("No need to remove data/enter, node name:%s.", pre_node->GetName().c_str()); continue; } if (IsolateAndDeleteNode(pre_node, {}) != SUCCESS) { @@ -174,7 +174,7 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { if (in_node == nullptr) { continue; } - if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH) || (in_node->GetType() == SWITCHN)) { + if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH)) { GELOGI("The in_node name is %s, and node type is %s.", in_node->GetName().c_str(), in_node->GetType().c_str()); auto ret = in_node_anchor->Unlink(in_data_anchor); if (ret != SUCCESS) { diff --git a/src/ge/graph/passes/get_original_format_pass.cc b/src/ge/graph/passes/get_original_format_pass.cc index 066c46ea..8c3c84f9 100644 --- a/src/ge/graph/passes/get_original_format_pass.cc +++ b/src/ge/graph/passes/get_original_format_pass.cc @@ -25,6 +25,7 @@ #include "framework/omg/omg_inner_types.h" #include "graph/utils/attr_utils.h" #include "graph/utils/op_desc_utils.h" +#include "graph/common/local_context.h" using domi::DOMI_TENSOR_NCHW; using domi::DOMI_TENSOR_NHWC; @@ -33,8 +34,6 @@ using domi::FAILED; using domi::PARAM_INVALID; using domi::SUCCESS; -using domi::GetContext; - namespace ge { Status GetOriginalFormatPass::Run(ge::ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); @@ -62,8 +61,8 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph GE_CHECK_NOTNULL(desc_ptr); auto is_data = (desc_ptr->GetType() == DATA_TYPE || desc_ptr->GetType() == AIPP_DATA_TYPE); if (is_data) { - GELOGI("Data node: %s,format :%d", node_ptr->GetName().c_str(), domi::GetContext().format); - ori_format = static_cast(domi::GetContext().format); + GELOGI("Data node: %s,format :%d", node_ptr->GetName().c_str(), GetLocalOmgContext().format); + ori_format = static_cast(GetLocalOmgContext().format); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(desc_ptr, ATTR_NAME_FORMAT, ori_format), GELOGE(FAILED, "set ATTR_NAME_FORMAT failed"); return FAILED); diff --git a/src/ge/graph/passes/infershape_pass.cc b/src/ge/graph/passes/infershape_pass.cc index 7ed1ea8c..cacca584 100644 --- a/src/ge/graph/passes/infershape_pass.cc +++ b/src/ge/graph/passes/infershape_pass.cc @@ -18,12 +18,21 @@ #include "common/util/error_manager/error_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" +#include "analyzer/analyzer.h" +#include "framework/common/util.h" #include "graph/shape_refiner.h" namespace ge { Status InferShapePass::Run(NodePtr &node) { auto ret = ShapeRefiner::InferShapeAndType(node, !OptionExists(kOptimizeAfterSubGraph)); if (ret != GRAPH_SUCCESS) { + // select INFERSHAPE failed info + auto graph = node->GetOwnerComputeGraph(); + GE_CHECK_NOTNULL(graph); + analyzer::DataInfo analyze_info{graph->GetSessionID(), graph->GetGraphID(), analyzer::INFER_SHAPE, node, + "InferShapeFailed!"}; + 
(void)Analyzer::GetInstance()->DoAnalyze(analyze_info); + GELOGE(GE_GRAPH_INFERSHAPE_FAILED, "infershape failed. node: %s", node->GetName().c_str()); return GE_GRAPH_INFERSHAPE_FAILED; } diff --git a/src/ge/graph/passes/iterator_op_pass.cc b/src/ge/graph/passes/iterator_op_pass.cc index 1d11004d..656ed390 100644 --- a/src/ge/graph/passes/iterator_op_pass.cc +++ b/src/ge/graph/passes/iterator_op_pass.cc @@ -73,14 +73,14 @@ Status IteratorOpPass::Run(ge::ComputeGraphPtr graph) { GE_IF_BOOL_EXEC(status != SUCCESS, GELOGW("Fail to Get var_desc of NODE_NAME_FLOWCTRL_LOOP_PER_ITER failed."); continue); Status ret; - ret = SetRtContext(graph->GetSessionID(), rtContext_t(), RT_CTX_NORMAL_MODE); + ret = SetRtContext(graph->GetSessionID(), graph->GetGraphID(), rtContext_t(), RT_CTX_NORMAL_MODE); // EOS will not be considered if ret is not SUCCESS. GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGW("Set rt context RT_CTX_NORMAL_MODE failed."); continue); status = GetVariableValue(graph->GetSessionID(), ge_tensor_desc, NODE_NAME_FLOWCTRL_LOOP_PER_ITER, &loop_per_iter); - ret = SetRtContext(graph->GetSessionID(), rtContext_t(), RT_CTX_GEN_MODE); + ret = SetRtContext(graph->GetSessionID(), graph->GetGraphID(), rtContext_t(), RT_CTX_GEN_MODE); // The following process will be affected if ret is not SUCCESS. GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Set rt context RT_CTX_GEN_MODE failed."); return ret); @@ -279,11 +279,14 @@ ge::OpDescPtr IteratorOpPass::CreateMemcpyAsyncOp(const ge::NodePtr &pre_node) { return op_desc; } -Status IteratorOpPass::SetRtContext(uint64_t session_id, rtContext_t rt_context, rtCtxMode_t mode) { - GELOGI("set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); +Status IteratorOpPass::SetRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t rt_context, rtCtxMode_t mode) { + GELOGI("set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id, graph_id, + static_cast(mode), ge::GetContext().DeviceId()); + GE_CHK_RT_RET(rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId())); GE_CHK_RT_RET(rtCtxSetCurrent(rt_context)); - RtContextUtil::GetInstance().AddRtContext(session_id, rt_context); + RtContextUtil::GetInstance().AddRtContext(session_id, graph_id, rt_context); + return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/passes/iterator_op_pass.h b/src/ge/graph/passes/iterator_op_pass.h index 78b951e6..77e80600 100644 --- a/src/ge/graph/passes/iterator_op_pass.h +++ b/src/ge/graph/passes/iterator_op_pass.h @@ -64,7 +64,7 @@ class IteratorOpPass : public GraphPass { /// ge::OpDescPtr CreateMemcpyAsyncOp(const ge::NodePtr &pre_node); - Status SetRtContext(uint64_t session_id, rtContext_t rt_context, rtCtxMode_t mode); + Status SetRtContext(uint64_t session_id, uint32_t graph_id, rtContext_t rt_context, rtCtxMode_t mode); }; } // namespace ge #endif // GE_GRAPH_PASSES_ITERATOR_OP_PASS_H_ diff --git a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc index 63ca68a2..4f122fb2 100644 --- a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc +++ b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc @@ -127,7 +127,7 @@ Status LinkGenMaskNodesPass::GetGenMaskGroupSize(vector &gen_mask_nodes auto ge_lib = GELib::GetInstance(); if ((ge_lib != nullptr) && ge_lib->InitFlag()) { - (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(gen_mask_op); + (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(gen_mask_node); } size_t gen_mask_group_num = kDefaultMaxParallelNum; diff --git 
a/src/ge/graph/passes/memcpy_addr_async_pass.cc b/src/ge/graph/passes/memcpy_addr_async_pass.cc index 3af40888..934f4737 100644 --- a/src/ge/graph/passes/memcpy_addr_async_pass.cc +++ b/src/ge/graph/passes/memcpy_addr_async_pass.cc @@ -19,6 +19,8 @@ #include "common/ge/ge_util.h" #include "framework/common/debug/log.h" #include "graph/utils/node_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" namespace ge { Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { @@ -262,6 +264,11 @@ Status MemcpyAddrAsyncPass::InsertMemAddrAsyncNodeBeforeNetoutput(const ComputeG if ((in_node->GetType() != CONSTANT) && (in_node->GetType() != CONSTANTOP) && (in_node->GetType() != DATA)) { continue; } + auto desc = in_node->GetOpDesc(); + GE_CHECK_NOTNULL(desc); + if (IsEmptyTenor(desc->GetOutputDesc(peer_out_anchor->GetIdx()).GetShape())) { + continue; + } GELOGI("Need to insert MemcpyAddrAsync before netoutput on parent graph."); NodePtr memcpy_addr_async_node = CreateMemcpyAddrAsyncNode(graph, peer_out_anchor, in_node); GE_IF_BOOL_EXEC(memcpy_addr_async_node == nullptr, GELOGE(INTERNAL_ERROR, "CreateMemcpyAddrAsyncNode failed."); @@ -271,9 +278,30 @@ Status MemcpyAddrAsyncPass::InsertMemAddrAsyncNodeBeforeNetoutput(const ComputeG GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "InsertMemcpyAddrAsyncNode failed."); return ret); GELOGI("Insert mem_addr_async node %s success between %s and %s.", memcpy_addr_async_node->GetName().c_str(), in_node->GetName().c_str(), node->GetName().c_str()); - NodeUtils::UpdateIsInputConst(memcpy_addr_async_node); + // if src node is const, need to update attr and offset here because this pass process is after offset set. + if ((in_node->GetType() == CONSTANT) || (in_node->GetType() == CONSTANTOP)) { + NodeUtils::UpdateIsInputConst(memcpy_addr_async_node); + auto output_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(output_desc); + auto output_tensor_desc = output_desc->MutableInputDesc(static_cast(in_data_anchor->GetIdx())); + int64_t data_offset = 0; + (void)TensorUtils::GetDataOffset(*output_tensor_desc, data_offset); + auto input_tensor = memcpy_addr_async_node->GetOpDesc()->MutableInputDesc(0); + GELOGI("Need update const Offset %ld to op [%s]", data_offset, memcpy_addr_async_node->GetName().c_str()); + TensorUtils::SetDataOffset(*input_tensor, data_offset); + TensorUtils::SetDataOffset(*output_tensor_desc, 0); + } } NodeUtils::UpdateIsInputConst(node); return SUCCESS; } + +bool MemcpyAddrAsyncPass::IsEmptyTenor(const GeShape &shape) const { + for (const auto dim : shape.GetDims()) { + if (dim == 0) { + return true; + } + } + return false; +} } // namespace ge diff --git a/src/ge/graph/passes/memcpy_addr_async_pass.h b/src/ge/graph/passes/memcpy_addr_async_pass.h index 1f184bd5..a70fcbdd 100644 --- a/src/ge/graph/passes/memcpy_addr_async_pass.h +++ b/src/ge/graph/passes/memcpy_addr_async_pass.h @@ -30,6 +30,7 @@ class MemcpyAddrAsyncPass : public GraphPass { void FindUserData(const NodePtr &node, uint32_t &parent_index); void FindUserDataForKnown(const NodePtr &parent_node, uint32_t &parent_index); void FindUserDataForNonDynamic(const ge::NodePtr &parent_node, uint32_t &parent_index); + bool IsEmptyTenor(const GeShape &shape) const; NodePtr CreateMemcpyAddrAsyncNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, const NodePtr &out_of_user_data); diff --git a/src/ge/graph/passes/multi_batch_clone_pass.cc b/src/ge/graph/passes/multi_batch_clone_pass.cc index 3390e783..4bf41dcb 100644 --- 
a/src/ge/graph/passes/multi_batch_clone_pass.cc +++ b/src/ge/graph/passes/multi_batch_clone_pass.cc @@ -16,8 +16,8 @@ #include "graph/passes/multi_batch_clone_pass.h" -#include "common/ge/ge_util.h" #include "common/formats/utils/formats_trans_utils.h" +#include "common/ge/ge_util.h" #include "graph/preprocess/multi_batch_options.h" #include "graph/utils/node_utils.h" #include "graph/utils/op_desc_utils.h" @@ -30,7 +30,9 @@ constexpr uint8_t kDataOutIndex = 0; constexpr uint8_t kCaseArgIndex = 1; const std::string kMultiBatchCaseNode = "ascend_mbatch_shape_case"; -const std::string kMultiBatchIndexNode = "ascend_mbatch_shape_data"; +const std::string kMultiBatchDataNode = "ascend_mbatch_shape_data"; +const std::string kMultiBatchConstNode = "ascend_mbatch_shape_const"; +const std::string kMultiBatchMapIndexNode = "ascend_mbatch_shape_mapindex"; } // namespace Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { @@ -59,6 +61,7 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { } (void)AttrUtils::SetStr(branch, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); + graph->InValid(); // The graph will be modified, so topological sorting is needed again. graph->Swap(*branch); if (CreateRootGraph(graph) != SUCCESS) { return FAILED; } @@ -174,40 +177,130 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { /// /// @ingroup ge -/// @brief Create index node for root graph. +/// @brief Create index data node for root graph. /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @param [in] NodePtr node: index data node. /// @return 0: SUCCESS / others: FAILED /// -Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { - // Data --> MapIndex --> Case - const OpDescPtr op_desc = MakeShared<OpDesc>(kMultiBatchIndexNode, DATA); - if (op_desc == nullptr) { - GELOGE(OUT_OF_MEMORY, "Create multi-batch index node failed"); +Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node) { + const OpDescPtr data_desc = MakeShared<OpDesc>(kMultiBatchDataNode, DATA); + if (data_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); return FAILED; } - GeTensorDesc data_desc(GeShape(), FORMAT_ND, DT_INT32); - if (op_desc->AddInputDesc(data_desc) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Add output desc failed"); + GeTensorDesc data_tensor(GeShape({static_cast<int64_t>(batch_shapes_[0].size())}), FORMAT_ND, DT_INT32); + if (data_desc->AddInputDesc(data_tensor) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Add input desc failed"); return FAILED; } - if (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS) { + if (data_desc->AddOutputDesc(data_tensor) != GRAPH_SUCCESS) { GELOGE(FAILED, "Add output desc failed"); return FAILED; } size_t data_index = all_data_nodes_.size(); - (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index); - (void)AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true); + (void)AttrUtils::SetInt(data_desc, ATTR_NAME_INDEX, data_index); + (void)AttrUtils::SetBool(data_desc, ATTR_INSERT_BY_MBATCH, true); - index_node_ = graph->AddNode(op_desc); - if (index_node_ == nullptr) { - GELOGE(OUT_OF_MEMORY, "Create multi-batch case node failed"); + node = graph->AddNode(data_desc); + if (node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); + return OUT_OF_MEMORY; + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Create index const node for root graph. +/// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @param [in] NodePtr node: index const node.
+/// @return 0: SUCCESS / others: FAILED +/// +Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, NodePtr &node) { + const OpDescPtr const_desc = MakeShared<OpDesc>(kMultiBatchConstNode, CONSTANT); + if (const_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); + return FAILED; + } + + int64_t count = batch_shapes_.size() * batch_shapes_[0].size(); + std::unique_ptr<int32_t[]> addr(new (std::nothrow) int32_t[count]); + GE_CHECK_NOTNULL(addr); + + size_t i = 0; + for (auto &batch_shape : batch_shapes_) { + for (int64_t dim : batch_shape) { + addr[i++] = static_cast<int32_t>(dim); + } + } + + GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32); + GeTensor tensor(const_tensor); + tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t)); + if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) { + GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str()); + return FAILED; + } + + if (const_desc->AddOutputDesc(const_tensor) != GRAPH_SUCCESS) { + GELOGE(OUT_OF_MEMORY, "Failed to add output desc for const node %s", const_desc->GetName().c_str()); + return FAILED; + } + + node = graph->AddNode(const_desc); + if (node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); + return OUT_OF_MEMORY; + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Create index node for root graph. +/// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @return 0: SUCCESS / others: FAILED +/// +Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { + // Data --> MapIndex --> Case + NodePtr data_node; + GE_CHK_STATUS_RET(CreateIndexDataNode(graph, data_node), "Create data node failed"); + + NodePtr const_node; + GE_CHK_STATUS_RET(CreateIndexConstNode(graph, const_node), "Create const node failed"); + + OpDescBuilder op_builder(kMultiBatchMapIndexNode, "MapIndex"); + op_builder.AddInput("x", data_node->GetOpDesc()->GetOutputDesc(0)) + .AddInput("data_seq", const_node->GetOpDesc()->GetOutputDesc(0)) + .AddOutput("y", GeTensorDesc(GeShape(), FORMAT_ND, DT_INT32)); + + const OpDescPtr op_desc = op_builder.Build(); + if (op_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch index desc failed"); + return FAILED; + } + NodePtr index_node = graph->AddNode(op_desc); + if (index_node == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch index node failed"); return OUT_OF_MEMORY; } - if (GraphUtils::AddEdge(index_node_->GetOutDataAnchor(0), case_node_->GetInDataAnchor(0)) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Failed to add edge between Data:%s to Case:%s", index_node_->GetName().c_str(), + if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", data_node->GetName().c_str(), + index_node->GetName().c_str()); + return FAILED; + } + if (GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(1)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", const_node->GetName().c_str(), + index_node->GetName().c_str()); + return FAILED; + } + if (GraphUtils::AddEdge(index_node->GetOutDataAnchor(0), case_node_->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to add edge between MapIndex:%s to Case:%s", index_node->GetName().c_str(), case_node_->GetName().c_str()); return FAILED; } @@ -366,6 +459,7 @@ Status MultiBatchClonePass::SetMaxShapeToData(const
NodePtr &data) { return SUCCESS; } + (void)AttrUtils::SetListInt(data->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims()); size_t max_shape_index = 0; int64_t max_size = 0; for (size_t i = 0; i < batch_shapes_.size(); ++i) { diff --git a/src/ge/graph/passes/multi_batch_clone_pass.h b/src/ge/graph/passes/multi_batch_clone_pass.h index 1da08e78..0d52b738 100644 --- a/src/ge/graph/passes/multi_batch_clone_pass.h +++ b/src/ge/graph/passes/multi_batch_clone_pass.h @@ -17,9 +17,9 @@ #ifndef GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ #define GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ +#include #include #include -#include #include "inc/graph_pass.h" @@ -45,6 +45,24 @@ class MultiBatchClonePass : public GraphPass { /// Status CreateRootGraph(const ComputeGraphPtr &graph); + /// + /// @ingroup ge + /// @brief Create index data node for root graph. + /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. + /// @param [in] NodePtr node: index data node. + /// @return 0: SUCCESS / others: FAILED + /// + Status CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node); + + /// + /// @ingroup ge + /// @brief Create index const node for root graph. + /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. + /// @param [in] NodePtr node: index const node. + /// @return 0: SUCCESS / others: FAILED + /// + Status CreateIndexConstNode(const ComputeGraphPtr &graph, NodePtr &node); + /// /// @ingroup ge /// @brief Create index node for root graph. @@ -149,7 +167,6 @@ class MultiBatchClonePass : public GraphPass { std::map all_branch_output_; NodePtr case_node_; - NodePtr index_node_; }; } // namespace ge #endif // GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_ diff --git a/src/ge/graph/passes/net_output_pass.cc b/src/ge/graph/passes/net_output_pass.cc index f9c3835f..8ded625c 100644 --- a/src/ge/graph/passes/net_output_pass.cc +++ b/src/ge/graph/passes/net_output_pass.cc @@ -27,6 +27,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/omg/omg_inner_types.h" #include "graph/debug/ge_attr_define.h" +#include "graph/common/local_context.h" #include "graph/passes/pass_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" @@ -413,7 +414,7 @@ Status NetOutputPass::ProcessWithNetoutput(const ge::ComputeGraphPtr &graph, con Status NetOutputPass::AddCtrlEdgesBetweenLeafAndNetOutput(const ge::ComputeGraphPtr &graph, const ge::NodePtr &net_out_node) { GE_CHECK_NOTNULL(net_out_node); - if (!domi::GetContext().user_out_nodes.empty()) { + if (!GetLocalOmgContext().user_out_nodes.empty()) { GELOGI("No need to add ctrl edge to netoutput because user out nodes have been set."); return SUCCESS; } @@ -603,7 +604,7 @@ Status NetOutputPass::SetUserDefDTypeAndFormatFromAtcParams(const NodePtr &outpu GELOGI("[NETOUTPUT PASS] The graph no need netoutput node!"); return SUCCESS; } - auto output_type = domi::GetContext().output_type; + auto output_type = GetLocalOmgContext().output_type; auto op_desc = output_node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); std::vector userdef_dtypes; diff --git a/src/ge/graph/passes/permute_pass.cc b/src/ge/graph/passes/permute_pass.cc index 3c0dfd4e..e55edbb2 100644 --- a/src/ge/graph/passes/permute_pass.cc +++ b/src/ge/graph/passes/permute_pass.cc @@ -24,10 +24,10 @@ #include "inc/kernel.h" #include "inc/kernel_factory.h" #include "framework/omg/omg_inner_types.h" +#include "graph/common/local_context.h" using domi::DOMI_TENSOR_ND; using domi::DOMI_TENSOR_NHWC; -using domi::GetContext; using domi::SUCCESS; using 
domi::TENSORFLOW; @@ -39,11 +39,11 @@ Status PermutePass::Run(ComputeGraphPtr graph) { OpDescPtr op_desc_ptr = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc_ptr); GE_IF_BOOL_EXEC( - op_desc_ptr->GetType() == PERMUTE && GetContext().type == domi::TENSORFLOW, + op_desc_ptr->GetType() == PERMUTE && GetLocalOmgContext().type == domi::TENSORFLOW, /// Input format 5D means NHWC in 4D way. So if input origin format is NCHW and /// permute parameter list is [0,3,1,2], this permute can be optimised. GE_IF_BOOL_EXEC( - GetContext().format != DOMI_TENSOR_ND, + GetLocalOmgContext().format != DOMI_TENSOR_ND, // Get input origin format for (NodePtr &n : graph->GetDirectNode()) { diff --git a/src/ge/graph/passes/reshape_recovery_pass.cc b/src/ge/graph/passes/reshape_recovery_pass.cc index 07b08de9..a3de0525 100644 --- a/src/ge/graph/passes/reshape_recovery_pass.cc +++ b/src/ge/graph/passes/reshape_recovery_pass.cc @@ -20,7 +20,7 @@ namespace ge { namespace { NodePtr CreateReshape(const ConstGeTensorDescPtr &src, const ConstGeTensorDescPtr &dst, const ComputeGraphPtr &graph) { - static std::atomic reshape_num(0); + static std::atomic_long reshape_num(0); auto next_num = reshape_num.fetch_add(1); auto reshape = MakeShared<OpDesc>("Reshape_ReshapeRecoveryPass_" + std::to_string(next_num), RESHAPE); if (reshape == nullptr) { @@ -83,4 +83,4 @@ Status ReshapeRecoveryPass::Run(ComputeGraphPtr graph) { } return SUCCESS; } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc index d51f52e1..2146a35d 100644 --- a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc @@ -64,9 +64,10 @@ void SameTransdataBreadthFusionPass::GetSubGraphNodesInfo() { } OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, const GeTensorDesc &out_desc) { - static uint32_t fusion_cast_op_count = 1; + static std::atomic_long atomic_fusion_cast_op_count(1); + auto fusion_cast_op_count = atomic_fusion_cast_op_count.fetch_add(1); std::stringstream cast_op_name; - cast_op_name << "fusion_cast_" << fusion_cast_op_count++; + cast_op_name << "fusion_cast_" << fusion_cast_op_count; auto node_op = ge::OperatorFactory::CreateOperator(cast_op_name.str(), CAST); auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); diff --git a/src/ge/graph/passes/subexpression_migration_pass.cc b/src/ge/graph/passes/subexpression_migration_pass.cc index cb09a743..c7f3845e 100644 --- a/src/ge/graph/passes/subexpression_migration_pass.cc +++ b/src/ge/graph/passes/subexpression_migration_pass.cc @@ -24,7 +24,6 @@ namespace ge { constexpr uint32_t kDataOutIndex = 0; constexpr uint32_t kCaseInputBase = 1; constexpr uint32_t kInvalidParent = 0x7fffffffU; -const std::set<std::string> kTransOpTypes = {"Cast", "TransData", "Reshape", "BnHost"}; bool IsSameTensor(ConstGeTensorDescPtr src_tensor, ConstGeTensorDescPtr dst_tensor) { if ((src_tensor == nullptr) && (dst_tensor == nullptr)) { @@ -163,7 +162,6 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap } data_nodes[parent_index] = data; - GELOGD("Subgraph %s has %zu Data nodes", subgraph->GetName().c_str(), data_nodes.size()); } } @@ -181,9 +179,9 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap /// /// @ingroup ge /// @brief Get all Data nodes for all subgraph. -/// @param [in] graph: Root compute graph.
-/// @param [in] func_desc: functional OpDesc of Case. -/// @param [out] graph_nodes: Data groups of subgraph. +/// @param [in] node: Node Directly to Data. +/// @param [out] inputs: parent index of Input. +/// @param [out] outputs: parent index of Output. /// @return true: SUCCESS / false: FAILED /// bool SubexpressionMigrationPass::GetAssociatedNodes(const NodePtr &node, map &inputs, @@ -227,9 +225,9 @@ bool SubexpressionMigrationPass::GetAssociatedNodes(const NodePtr &node, map> &graph_nodes, @@ -245,10 +243,10 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapsecond; const auto &out_anchor = work_data->GetOutDataAnchor(kDataOutIndex); - const auto &in_ahchors = out_anchor->GetPeerInDataAnchors(); - const auto &in_anchor = in_ahchors.at(anchor_idx); + const auto &in_anchors = out_anchor->GetPeerInDataAnchors(); + const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { - GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_ahchors.size()); + GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return false; } @@ -288,7 +286,8 @@ Status SubexpressionMigrationPass::GraphNodeMigration(const ComputeGraphPtr &gra for (size_t i = 0; i < in_anchors.size(); ++i) { const auto &in_anchor = in_anchors.at(i); const auto &base_node = in_anchor->GetOwnerNode(); - if (kTransOpTypes.count(base_node->GetType()) == 0) { + GELOGD("Get Data direct node: %s", base_node->GetName().c_str()); + if (!base_node->GetHostNode()) { continue; } @@ -453,7 +452,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra GELOGE(FAILED, "Node: %s parent index %u not found", attach->GetName().c_str(), i); return FAILED; } - if (it_idx->second == kInvalidParent) { // Not connnect, Skip. + if (it_idx->second == kInvalidParent) { // Not connect, Skip. continue; } @@ -469,7 +468,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra if (it_idx == outputs.end()) { return FAILED; } - if (it_idx->second == kInvalidParent) { // Not connnect, Skip. + if (it_idx->second == kInvalidParent) { // Not connect, Skip. continue; } diff --git a/src/ge/graph/passes/subexpression_migration_pass.h b/src/ge/graph/passes/subexpression_migration_pass.h index ac750725..fbe28cae 100644 --- a/src/ge/graph/passes/subexpression_migration_pass.h +++ b/src/ge/graph/passes/subexpression_migration_pass.h @@ -48,9 +48,9 @@ class SubexpressionMigrationPass : public GraphPass { /// /// @ingroup ge /// @brief Get all Data nodes for all subgraph. - /// @param [in] graph: Root compute graph. - /// @param [in] func_desc: functional OpDesc of Case. - /// @param [out] graph_nodes: Data groups of subgraph. + /// @param [in] node: Node Directly to Data. + /// @param [out] inputs: parent index of Input. + /// @param [out] outputs: parent index of Output. /// @return true: SUCCESS / false: FAILED /// bool GetAssociatedNodes(const NodePtr &node, map &inputs, map &outputs); @@ -59,13 +59,13 @@ class SubexpressionMigrationPass : public GraphPass { /// @ingroup ge /// @brief Get all Data nodes for all subgraph. /// @param [in] graph_nodes: Data groups of subgraph. - /// @param [in] data_base: Data Node for migration. - /// @param [in] data_idx: Data groups of subgraph. - /// @param [in] data_idx: Data groups of subgraph. + /// @param [in] base_node: Data Node for migration. + /// @param [in] node_idx: Parent index of Data node. + /// @param [in] anchor_idx: Anchor index of node. 
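The GraphNodeMigration change above replaces the closed kTransOpTypes set with Node::GetHostNode(), so any node flagged as host-resident becomes a migration candidate rather than just the four listed op types. A toy before/after of that predicate, with a stand-in struct for ge::Node and the assumption that GetHostNode() simply reports the host flag:

#include <set>
#include <string>

// Toy stand-in for ge::Node, only to contrast the two predicates.
struct Node {
  std::string type;
  bool host_node;  // what GetHostNode() is assumed to report
};

// Before: eligibility was a hard-coded list of op types.
bool EligibleBefore(const Node &node) {
  static const std::set<std::string> kTransOpTypes = {"Cast", "TransData", "Reshape", "BnHost"};
  return kTransOpTypes.count(node.type) > 0;
}

// After: any host-resident node qualifies, so new host-executed op types
// need no edit to this pass.
bool EligibleAfter(const Node &node) { return node.host_node; }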
/// @return true: Same / false: not same /// bool IsParallelNodeSame(const map> &graph_nodes, const NodePtr &base_node, - uint32_t base_idx, uint32_t anchor_idx); + uint32_t node_idx, uint32_t anchor_idx); /// /// @ingroup ge @@ -134,4 +134,4 @@ class SubexpressionMigrationPass : public GraphPass { bool migration_append_{false}; }; } // namespace ge -#endif // GE_COMMON_SUBEXPRESSION_MIGRATION_H_ \ No newline at end of file +#endif // GE_COMMON_SUBEXPRESSION_MIGRATION_H_ diff --git a/src/ge/graph/passes/switch_data_edges_bypass.cc b/src/ge/graph/passes/switch_data_edges_bypass.cc index 059ad772..d7f5d90f 100644 --- a/src/ge/graph/passes/switch_data_edges_bypass.cc +++ b/src/ge/graph/passes/switch_data_edges_bypass.cc @@ -16,6 +16,7 @@ #include "switch_data_edges_bypass.h" +#include #include "common/debug/log.h" #include "common/ge/ge_util.h" #include "common/op/ge_op_utils.h" @@ -78,7 +79,8 @@ std::pair GetInDataNodeByIndex(const NodePtr &node, i return {out_anchor->GetOwnerNode(), out_anchor}; } NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { - static int identity_counter = 0; + static std::atomic_long atomic_identity_counter(0); + auto identity_counter = atomic_identity_counter.fetch_add(1); auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { @@ -100,7 +102,7 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { } auto identity_opdesc = - MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter++), IDENTITY); + MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter), IDENTITY); if (identity_opdesc == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; @@ -117,7 +119,8 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { return identity; } NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { - static int counter = 0; + static std::atomic_long atomic_counter(0); + auto counter = atomic_counter.fetch_add(1); auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { @@ -138,7 +141,7 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { return nullptr; } - auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter++), MEMCPYASYNC); + auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter), MEMCPYASYNC); if (memcpy_opdesc == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; @@ -218,4 +221,4 @@ Status SwitchDataEdgesBypass::BypassSwitch(const NodePtr &node) { return SUCCESS; } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/graph/passes/transop_breadth_fusion_pass.cc b/src/ge/graph/passes/transop_breadth_fusion_pass.cc index d8df4a22..5c754f4f 100644 --- a/src/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -28,6 +28,12 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { return SUCCESS; } + // breadth fusion pass requires new topologic + Status ret_topo = graph->TopologicalSorting(); + if (ret_topo != SUCCESS) { + GELOGE(ret_topo, "TopologicalSorting the merged graph failed."); + return ret_topo; + } for (auto const &node : graph->GetDirectNode()) { GE_CHECK_NOTNULL(node); diff --git a/src/ge/graph/passes/transop_symmetry_elimination_pass.cc b/src/ge/graph/passes/transop_symmetry_elimination_pass.cc index 
9d0ac4d4..e217656c 100644 --- a/src/ge/graph/passes/transop_symmetry_elimination_pass.cc +++ b/src/ge/graph/passes/transop_symmetry_elimination_pass.cc @@ -163,9 +163,9 @@ bool TransOpSymmetryEliminationPass::JudgeTransposeDBack2Raw(const NodePtr &src_ // which we get through 3: i = perm_1[perm_2[i]] // vector src_node_perm; - AttrUtils::GetListInt(src_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, src_node_perm); + (void)AttrUtils::GetListInt(src_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, src_node_perm); vector dst_node_perm; - AttrUtils::GetListInt(dst_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, dst_node_perm); + (void)AttrUtils::GetListInt(dst_node->GetOpDesc(), ge::PERMUTE_ATTR_PERM, dst_node_perm); if (src_node_perm.size() != dst_node_perm.size()) { return false; diff --git a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 3080e886..61bca6b8 100644 --- a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include "common/ge/ge_util.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" @@ -451,9 +452,11 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkNodesWhenDescNotChanged( OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDesc &format_trans_input_desc, const GeTensorDesc &format_trans_output_desc) { - static uint32_t fusion_format_transfer_op_count = 1; + static std::atomic_long atomic_fusion_format_transfer_op_count(1); + auto fusion_format_transfer_op_count = atomic_fusion_format_transfer_op_count.fetch_add(1); + std::stringstream format_transfer_op_name; - format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count++; + format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count; OpDescPtr format_transfer_op = MakeShared(format_transfer_op_name.str().c_str(), TRANSDATA); if (format_transfer_op == nullptr) { GELOGE(INTERNAL_ERROR, "new format transfer op failed!"); @@ -496,9 +499,11 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_input_desc, const GeTensorDesc &cast_output_desc) { + static std::atomic_long atomic_fusion_cast_op_count(1); + auto fusion_cast_op_count = atomic_fusion_cast_op_count.fetch_add(1); + std::stringstream cast_op_name; - static uint32_t fusion_cast_op_count = 1; - cast_op_name << "fusion_cast_op_" << fusion_cast_op_count++; + cast_op_name << "fusion_cast_op_" << fusion_cast_op_count; auto node_op = ge::OperatorFactory::CreateOperator(cast_op_name.str(), CAST); auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); diff --git a/src/ge/graph/passes/transpose_transdata_pass.cc b/src/ge/graph/passes/transpose_transdata_pass.cc index 3ac6dea5..b9bd59be 100644 --- a/src/ge/graph/passes/transpose_transdata_pass.cc +++ b/src/ge/graph/passes/transpose_transdata_pass.cc @@ -43,7 +43,7 @@ Status TransposeTransDataPass::Run(NodePtr &node) { return PARAM_INVALID; } - if (op_desc->GetType() != TRANSPOSE && op_desc->GetType() != TRANSPOSED) { + if (op_desc->GetType() != TRANSPOSED) { return SUCCESS; } if (CheckOneInAndOneOutDataAnchor(node) != SUCCESS) { diff --git a/src/ge/graph/preprocess/graph_preprocess.cc b/src/ge/graph/preprocess/graph_preprocess.cc index 4df22cfc..20216941 100644 --- 
a/src/ge/graph/preprocess/graph_preprocess.cc +++ b/src/ge/graph/preprocess/graph_preprocess.cc @@ -32,6 +32,7 @@ #include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" #include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -1073,10 +1074,14 @@ Status GraphPrepare::CheckRefOp() { }; Status GraphPrepare::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode) { - GELOGI("set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); + GE_CHECK_NOTNULL(compute_graph_); + GELOGI("set rt_context, session id: %lu, graph id: %u, mode %d, device id:%u.", session_id_, + compute_graph_->GetGraphID(), static_cast(mode), ge::GetContext().DeviceId()); + GE_CHK_RT_RET(rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId())); GE_CHK_RT_RET(rtCtxSetCurrent(rt_context)); - RtContextUtil::GetInstance().AddRtContext(session_id_, rt_context); + RtContextUtil::GetInstance().AddRtContext(session_id_, compute_graph_->GetGraphID(), rt_context); + return SUCCESS; } @@ -1109,14 +1114,14 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { } Status GraphPrepare::UpdateInput(const std::vector &user_input) { - compute_graph_->SaveDataFormat(ge::TypeUtils::DomiFormatToFormat(domi::GetContext().format)); + compute_graph_->SaveDataFormat(ge::TypeUtils::DomiFormatToFormat(GetLocalOmgContext().format)); for (NodePtr &input_node : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(input_node); OpDescPtr op = input_node->GetOpDesc(); GE_CHECK_NOTNULL(op); if (op->GetType() == DATA) { GeAttrValue::INT index = 0; - if ((!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) || (domi::GetContext().is_dynamic_input)) { + if ((!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) || (GetLocalOmgContext().is_dynamic_input)) { GELOGW("Get index from data attr failed"); continue; } @@ -1357,7 +1362,7 @@ Status GraphPrepare::PrepareDynShape(ConstGraphPtr graph, const std::vector(options_.framework_type); + GetLocalOmgContext().type = static_cast(options_.framework_type); const Graph &const_graph = *graph; PP_RUN("Init", Init, const_graph, session_id); @@ -1520,7 +1525,7 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { } Status GraphPrepare::CheckUserInput(const std::vector &user_input) { - if (domi::GetContext().is_dynamic_input) { + if (GetLocalOmgContext().is_dynamic_input) { return SUCCESS; } unsigned int node_num = 0; diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc index c231ef15..eb936282 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -39,6 +39,7 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "proto/insert_op.pb.h" +#include "graph/common/local_context.h" #define SAVE_AIPP_ATTR(KEY, SAVE_TYPE) \ do { \ @@ -144,13 +145,13 @@ int64_t CalcMaxSize(int64_t batch_count) { } Format GetAndCheckFormat() { - switch (domi::GetContext().format) { + switch (GetLocalOmgContext().format) { case domi::DOMI_TENSOR_NCHW: return FORMAT_NCHW; case domi::DOMI_TENSOR_NHWC: return FORMAT_NHWC; default: - GELOGE(PARAM_INVALID, "Unexpected format found %d", static_cast(domi::GetContext().format)); + GELOGE(PARAM_INVALID, "Unexpected format found %d", static_cast(GetLocalOmgContext().format)); return FORMAT_ND; } } @@ -619,8 +620,9 @@ void 
AippOp::SetDtcDefaultValue() { Status AippOp::GenerateOpDesc(OpDescPtr op_desc) { GE_CHECK_NOTNULL(op_desc); - static int op_idx = 0; - op_desc->SetName(std::string("aipp_node").append(std::to_string(op_idx++))); + static std::atomic_long atomic_op_idx(0); + auto op_idx = atomic_op_idx.fetch_add(1); + op_desc->SetName(std::string("aipp_node").append(std::to_string(op_idx))); op_desc->SetType(AIPP); // Add two InputDesc, add the second after the first one is added successfully. diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index 38bc595e..c55be013 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -39,6 +39,8 @@ using domi::AippOpParams; namespace ge { namespace { const char *const kMbatchSwitchnName = "mbatch-switch-name"; +const int64_t kFormatAgnosticSwitch = 1; +const int64_t kFormatDependInputIndex = 1; } // namespace static void ConvertShape2Nhwc(Format &format, vector &shape_vec) { if ((format == FORMAT_NHWC) || (shape_vec.size() != static_cast(NORMAL_TENSOR_SIZE))) { @@ -200,9 +202,28 @@ Status InsertNewOpUtil::GetAippParams(const std::unique_ptr return SUCCESS; } + +Status InsertNewOpUtil::AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node) { + GE_CHECK_NOTNULL(aipp_node); + auto next_nodes = aipp_node->GetOutDataNodes(); + for (const auto next_node : next_nodes) { + GE_CHECK_NOTNULL(next_node); + auto op_desc = next_node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + if (op_desc->GetType() == SWITCHN) { + GELOGI("Find switchn node [%s] after aipp [%s]", op_desc->GetName().c_str(), aipp_node->GetName().c_str()); + (void)AttrUtils::SetInt(op_desc, "_format_agnostic", kFormatAgnosticSwitch); + (void)AttrUtils::SetListInt(op_desc, "_format_agnostic_except_input", + std::vector({kFormatDependInputIndex})); + } + } + return SUCCESS; +} + Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { std::map switchn_names_to_data; std::set updated_switchn; + NodePtr multbatch_case; for (auto &node : graph->GetDirectNode()) { if (node->GetType() == DATA) { @@ -213,6 +234,12 @@ Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { } if (node->GetType() == AIPP) { GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn)); + // In dynamic batch/HW and dynamic AIPP scenarios, SwitchN should be set format agnostic; otherwise TransData may be + // inserted between AIPP and SwitchN, which introduces performance and memory overhead.
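AddFormatAgnosticAttrToSwitchn above only writes the two attributes; the pass that consumes them is not part of this diff. A hypothetical consumer sketch, using the same AttrUtils calls and assuming GE's attr_utils/op_desc headers, to make the intended contract concrete: SwitchN follows whatever format its producer settles on, except for input 1 (kFormatDependInputIndex, the batch-selector input), so no TransData lands between AIPP and SwitchN:

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical format-selection check honouring the attributes set above;
// FollowsInputFormat is illustrative and does not exist in GE.
bool FollowsInputFormat(const ge::OpDescPtr &op, int64_t input_index) {
  int64_t agnostic = 0;
  if (!ge::AttrUtils::GetInt(op, "_format_agnostic", agnostic) || (agnostic != 1)) {
    return false;
  }
  std::vector<int64_t> except_inputs;
  (void)ge::AttrUtils::GetListInt(op, "_format_agnostic_except_input", except_inputs);
  return std::find(except_inputs.begin(), except_inputs.end(), input_index) == except_inputs.end();
}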
+ GE_RETURN_IF_ERROR(AddFormatAgnosticAttrToSwitchn(node)); + } + if (node->GetType() == CASE && node->GetOpDesc()->HasAttr(ATTR_NAME_BATCH_NUM)) { + multbatch_case = node; } } @@ -225,8 +252,107 @@ Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { GE_RETURN_IF_ERROR(UpdateDataBySwitchN(switchn, data_iter->second)); } + if (multbatch_case != nullptr) { + GE_RETURN_IF_ERROR(UpdateCaseNode(graph, multbatch_case)); + } + return SUCCESS; +} + +Status InsertNewOpUtil::FindMaxSizeNode(const ComputeGraphPtr &graph, const NodePtr &case_node, + map &max_sizes, + map &aipp_inputs) { + const auto &func_desc = case_node->GetOpDesc(); + for (const auto &name : func_desc->GetSubgraphInstanceNames()) { + const auto &subgraph = graph->GetSubgraph(name); + if (subgraph == nullptr) { + GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); + return GE_GRAPH_EMPTY_SUBGRAPH; + } + + std::set updated_switchn; // fix interface + for (auto &node : subgraph->GetDirectNode()) { + if (node->GetType() == AIPP) { + GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn)); + int64_t size = 0; + auto in_data_anchor = node->GetInDataAnchor(0); + GE_CHECK_NOTNULL(in_data_anchor); + auto peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + const auto &src_node = peer_out_anchor->GetOwnerNode(); + const auto &src_op = src_node->GetOpDesc(); + GE_CHECK_NOTNULL(src_op); + + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); + return FAILED; + } + + auto aipp_op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(aipp_op_desc); + auto input = aipp_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(input); + if (TensorUtils::GetSize(*input, size) == GRAPH_SUCCESS) { + if (max_sizes[parent_index] < size) { + max_sizes[parent_index] = size; + aipp_inputs[parent_index] = input; + } + } + } + } + } + + return SUCCESS; +} + +Status InsertNewOpUtil::UpdateCaseNode(const ComputeGraphPtr &graph, const NodePtr &case_node) { + const auto &func_desc = case_node->GetOpDesc(); + map max_sizes; + map aipp_inputs; + + GE_RETURN_IF_ERROR(FindMaxSizeNode(graph, case_node, max_sizes, aipp_inputs)); + for (const auto &item : aipp_inputs) { + uint32_t parent_index = item.first; + const GeTensorDescPtr &aipp_input = item.second; + GE_CHECK_NOTNULL(aipp_input); + + const GeTensorDescPtr &input_desc = func_desc->MutableInputDesc(parent_index); + GE_CHECK_NOTNULL(input_desc); + input_desc->SetDataType(aipp_input->GetDataType()); + input_desc->SetOriginDataType(aipp_input->GetOriginDataType()); + input_desc->SetShape(aipp_input->GetShape()); + input_desc->SetOriginShape(aipp_input->GetShape()); + input_desc->SetFormat(aipp_input->GetFormat()); + input_desc->SetOriginFormat(aipp_input->GetFormat()); + ge::TensorUtils::SetSize(*input_desc, max_sizes[item.first]); + + const auto &in_anchor = case_node->GetInDataAnchor(parent_index); + const auto &out_anchor = in_anchor->GetPeerOutAnchor(); + const auto &data = out_anchor->GetOwnerNode(); + auto data_opdesc = data->GetOpDesc(); + GE_CHECK_NOTNULL(data_opdesc); + Format old_format = data_opdesc->MutableOutputDesc(0)->GetFormat(); + + auto ret = data_opdesc->UpdateOutputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to update data %s output using case %s", data->GetName().c_str(), + case_node->GetName().c_str()); + return INTERNAL_ERROR; + } + ret = 
data_opdesc->UpdateInputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to update data %s input using case %s", data->GetName().c_str(), + case_node->GetName().c_str()); + return INTERNAL_ERROR; + } + + // Update attr _mbatch_origin_input_dims for data when it is linked to aipp + UpdateMultiBatchInputDims(data_opdesc, old_format); + } + return SUCCESS; } + Status InsertNewOpUtil::UpdatePrevNodeByAipp(NodePtr &node, std::set &switchns) { GELOGI("Start to update prev node size by aipp %s.", node->GetName().c_str()); auto aipp_op_desc = node->GetOpDesc(); @@ -389,7 +515,7 @@ Status InsertNewOpUtil::GetDataRelatedNode(NodePtr &node, std::mapGetOpDesc(); GE_CHECK_NOTNULL(dst_op); - if (dst_op->GetType() == AIPP || dst_op->GetType() == SWITCHN) { + if (dst_op->GetType() == AIPP || dst_op->GetType() == SWITCHN || dst_op->GetType() == CASE) { auto data_iter = data_next_node_map.find(node); if (data_iter == data_next_node_map.end()) { std::set next_node_set; @@ -407,7 +533,7 @@ Status InsertNewOpUtil::GetDataRelatedNode(NodePtr &node, std::map &aipps) { +Status InsertNewOpUtil::GetAllAipps(const NodePtr &data_node, const NodePtr &node, std::vector &aipps) { GE_CHECK_NOTNULL(node); OpDescPtr op = node->GetOpDesc(); GE_CHECK_NOTNULL(op); @@ -427,6 +553,32 @@ Status InsertNewOpUtil::GetAllAipps(const NodePtr &node, std::vector &a } } } + } else if (op->GetType() == CASE) { + const ComputeGraphPtr &graph = node->GetOwnerComputeGraph(); + for (const auto &name : op->GetSubgraphInstanceNames()) { + const auto &subgraph = graph->GetSubgraph(name); + if (subgraph == nullptr) { + GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); + return GE_GRAPH_EMPTY_SUBGRAPH; + } + + for (auto &subgraph_node : subgraph->GetDirectNode()) { + if (subgraph_node->GetType() == AIPP) { + auto src_node = subgraph_node->GetInDataNodes().at(0); + const auto &src_op = src_node->GetOpDesc(); + GE_CHECK_NOTNULL(src_op); + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); + return FAILED; + } + auto data = node->GetInDataNodes().at(parent_index); + if (data->GetName() == data_node->GetName()) { + aipps.emplace_back(subgraph_node); + } + } + } + } } return SUCCESS; } @@ -446,14 +598,14 @@ Status InsertNewOpUtil::RecordAIPPInfoToData(const ComputeGraphPtr &graph) { auto data_node = it.first; auto data_op_desc = data_node->GetOpDesc(); GE_CHECK_NOTNULL(data_op_desc); - std::set aipps_or_switchs = it.second; - if (aipps_or_switchs.size() != 1) { + std::set aipps_or_switchs_or_case = it.second; + if (aipps_or_switchs_or_case.size() != 1) { GELOGW("The number of successors swith or aipp of data is more than 1"); continue; } std::vector aipps; - GE_RETURN_IF_ERROR(GetAllAipps(*aipps_or_switchs.begin(), aipps)); + GE_RETURN_IF_ERROR(GetAllAipps(data_node, *aipps_or_switchs_or_case.begin(), aipps)); GELOGI("RecordAIPPInfoToData: Data: name[%s], type[%s], batch size[%u]", data_node->GetName().c_str(), data_node->GetType().c_str(), aipps.size()); diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h index 93a96ca2..ae431c32 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h @@ -30,7 +30,7 @@ enum AippType { OLD_TYPE, NEW_TYPE }; class InsertNewOpUtil { public: static InsertNewOpUtil &Instance() 
{ - static InsertNewOpUtil instance; + thread_local InsertNewOpUtil instance; return instance; } @@ -64,10 +64,14 @@ class InsertNewOpUtil { void UpdateMultiBatchInputDims(const OpDescPtr &data_opdesc, Format &old_format); Status UpdatePrevNodeByAipp(NodePtr &node, std::set &switchns); Status UpdateDataBySwitchN(const NodePtr &switchn, const NodePtr &data); + Status AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node); Status GetDataRelatedNode(NodePtr &node, std::map> &data_next_node_map); - Status GetAllAipps(const NodePtr &node, std::vector &aipps); + Status GetAllAipps(const NodePtr &data_node, const NodePtr &node, std::vector &aipps); Status GetInputOutputInfo(NodePtr &data_node, NodePtr &aipp_node, std::string &input, std::string &output); Status SetModelInputDims(NodePtr &data_node, NodePtr &aipp_node); + Status FindMaxSizeNode(const ComputeGraphPtr &graph, const NodePtr &case_node, map &max_sizes, + map &aipp_inputs); + Status UpdateCaseNode(const ComputeGraphPtr &graph, const NodePtr &case_node); }; } // namespace ge diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc index 8a066b6a..6adcc63e 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -30,15 +30,16 @@ #include "framework/omg/omg_inner_types.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" +#include "graph/passes/multi_batch_clone_pass.h" #include "graph/passes/prune_pass.h" +#include "graph/preprocess/multi_batch_options.h" #include "graph/utils/attr_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" -#include "graph/preprocess/multi_batch_options.h" #include "inc/pass_manager.h" -#include "graph/passes/multi_batch_clone_pass.h" +#include "graph/common/local_context.h" using std::set; using std::string; @@ -54,6 +55,9 @@ const int kDataOutIndex = 0; const int kDataInIndex = 0; const int kMergeDataOutIndex = 0; const int kStaticOutput = -1; +const int kDynmaicDims = -1; +const int kDynamicBatchDynamicDimsNum = 1; +const int kDynamicImgSizeDynamciDimsNum = 2; inline bool IsDataLikeType(const std::string &node_type) { return (node_type == DATA) || (node_type == AIPP); } @@ -131,6 +135,8 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { return nullptr; } + (void)AttrUtils::SetListStr(desc, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, {node->GetName()}); + auto graph = node->GetOwnerComputeGraph(); return graph->AddNode(desc); } @@ -228,6 +234,12 @@ Status MultiBatchGraphCopyer::CopyGraph() { return ret; } + ret = InsertIdentityAfterSwitchN(); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to insert identity nodes after switchn node."); + return INTERNAL_ERROR; + } + GELOGI("Begin to remove useless nodes by prune pass after copy process"); PrunePass prune_pass; ret = prune_pass.Run(graph_); @@ -255,7 +267,10 @@ Status MultiBatchGraphCopyer::Init() { Status MultiBatchGraphCopyer::LabelStatus() { for (const auto &data : origin_data_nodes_) { - origin_nodes_status_[data.get()] = kNodeInBatchBranch; + auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape(); + if (!IsAllDimsPositive(data_shape.GetDims())) { + origin_nodes_status_[data.get()] = kNodeInBatchBranch; + } } bool changed = true; // If anyone of in node is kNodeInBatchBranch, it is also kNodeInBatchBranch @@ -267,8 +282,9 @@ Status MultiBatchGraphCopyer::LabelStatus() { continue; 
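LabelStatus above now seeds kNodeInBatchBranch only from Data nodes whose shape actually carries a dynamic dimension, and UpdateDataToDynamicInfo below leans on the same check. The IsAllDimsPositive helper is called but never defined in this diff; a sketch consistent with its call sites, where a negative dim (kDynmaicDims is -1) marks a dynamic axis:

#include <algorithm>
#include <cstdint>
#include <vector>

// Assumed behaviour of IsAllDimsPositive (not defined in this diff): a shape
// is fully static only if no dim is negative, since negative dims encode
// dynamic axes.
bool IsAllDimsPositive(const std::vector<int64_t> &dims) {
  return std::none_of(dims.begin(), dims.end(), [](int64_t dim) { return dim < 0; });
}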
} for (auto &in_node : node->GetInAllNodes()) { - if (origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() && - origin_nodes_status_[in_node.get()] == kNodeInBatchBranch) { + bool is_in_batch = origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() && + origin_nodes_status_[in_node.get()] == kNodeInBatchBranch; + if (is_in_batch) { origin_nodes_status_[node.get()] = kNodeInBatchBranch; changed = true; break; @@ -316,6 +332,10 @@ Status MultiBatchGraphCopyer::CreateNewNodes() { switch (branch_status) { case kNodeStartNode: GELOGD("Name: %s, type: %s, status: kNodeStartNode.", node->GetName().c_str(), node->GetType().c_str()); + ret = UpdateDataToDynamicInfo(node); + if (ret != SUCCESS) { + break; + } ret = InsertSwitchNForData(node); if (ret == SUCCESS) { ret = UpdateMaxShapeToData(node); @@ -712,7 +732,57 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { data_nodes_to_switchn_[data.get()] = switchn; return SUCCESS; } - +Status MultiBatchGraphCopyer::UpdateDataToDynamicInfo(const NodePtr &data) { + auto data_desc = NodeUtils::GetOutputDesc(*data, kDataOutIndex); + auto data_shape = data_desc.GetShape(); + auto data_format = data_desc.GetFormat(); + auto data_name = data->GetName(); + if (IsAllDimsPositive(data_shape.GetDims())) { + return SUCCESS; + } + if (data_to_dynamic_info_.find(data_name) == data_to_dynamic_info_.end()) { + auto data_shape_dims = data_shape.GetDims(); + auto dynamic_dims_num = std::count_if(data_shape_dims.begin(), data_shape_dims.end(), + [&data_shape_dims](int64_t dim) { return dim < 0; }); + if (dynamic_type_ == DynamicType::kDynamicBatch) { + if (dynamic_dims_num != kDynamicBatchDynamicDimsNum || data_shape.GetDim(0) != kDynmaicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic batch rule", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + } else if (dynamic_type_ == DynamicType::kDynamicImageSize) { + int64_t height = 0; + int64_t width = 0; + if (data_format == FORMAT_NCHW) { + height = data_shape.GetDim(NCHW_DIM_H); + width = data_shape.GetDim(NCHW_DIM_W); + } else if (data_format == FORMAT_NHWC) { + height = data_shape.GetDim(NHWC_DIM_H); + width = data_shape.GetDim(NHWC_DIM_W); + } + if (dynamic_dims_num != kDynamicImgSizeDynamciDimsNum || height != kDynmaicDims || width != kDynmaicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic image size rule", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + } else if (dynamic_type_ == DynamicType::kDynamicDims) { + GELOGE(INTERNAL_ERROR, "data: %s shape:%s must be set in --input_shape", data->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + // handle data whose dynamic dims are not listed in the atc parameter --input_shape + if (data_to_dynamic_info_.empty()) { + vector>> tmp_data_name_and_shape{std::make_pair(data_name, data_shape_dims)}; + auto ret = ParserDataToDynmaicInfo(shapes_, tmp_data_name_and_shape, data_to_dynamic_info_); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "parse data: %s dynamic gear info failed", data_name.c_str()); + return INTERNAL_ERROR; + } + } + data_to_dynamic_info_[data_name] = data_to_dynamic_info_.begin()->second; + } + return SUCCESS; +} Status MultiBatchGraphCopyer::InsertMergeForEdgeNode(const NodePtr &node) { for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto src_out_anchor = in_data_anchor->GetPeerOutAnchor(); @@ -911,33 +981,77 @@ Status
MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { return SUCCESS; } -Status ProcessMultiBatch(ComputeGraphPtr &graph) { - const char *multi_batch_with_case = std::getenv("MULTI_BATCH_WITH_CASE"); - if (multi_batch_with_case != nullptr) { - PassManager pass_manager; - GE_CHK_STATUS_RET(pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass)); - return pass_manager.Run(graph); +Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() { + for (auto &node : graph_->GetAllNodes()) { + if (node->GetType() != SWITCHN) { + continue; + } + auto switchn_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(switchn_desc); + size_t i = 0; + for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { + for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { + auto identity_desc = MakeShared(node->GetName() + "_identity_" + std::to_string(i), IDENTITY); + GE_CHECK_NOTNULL(identity_desc); + + auto out_node = in_data_anchor->GetOwnerNode(); + auto op_desc = out_node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + string batch_label; + if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + GELOGE(FAILED, "Set attr ATTR_NAME_BATCH_LABEL failed, node:%s.", identity_desc->GetName().c_str()); + return FAILED; + } + } + + auto data_desc = switchn_desc->GetOutputDesc(i); + i++; + GE_CHK_STATUS_RET(identity_desc->AddInputDesc("x", data_desc)); + GE_CHK_STATUS_RET(identity_desc->AddOutputDesc("y", data_desc)); + + auto identity_node = graph_->AddNode(identity_desc); + GE_CHECK_NOTNULL(identity_node); + GE_CHK_STATUS_RET(out_data_anchor->LinkTo(identity_node->GetInDataAnchor(0))); + GE_CHECK_NOTNULL(identity_node->GetOutControlAnchor()); + GE_CHK_STATUS_RET(identity_node->GetOutControlAnchor()->LinkTo(out_node->GetInControlAnchor())); + } + } } + return SUCCESS; +} + +Status ProcessMultiBatch(ComputeGraphPtr &graph) { std::vector> shapes; if (!InitDynamicParams(shapes)) { GELOGD("There is no multi-batch options, no need to process multi-batch copy"); return SUCCESS; } map>> data_to_dynamic_info; - if (ParserDataToDynmaicInfo(shapes, data_to_dynamic_info) != SUCCESS) { - GELOGD("Parse each data's own dynamic info failed"); - return SUCCESS; + // parse data dynamic info from atc parameter --input_shape + if (ParserDataToDynmaicInfo(shapes, GetLocalOmgContext().user_input_dims, data_to_dynamic_info) != SUCCESS) { + GELOGE(PARAM_INVALID, "Parse each data's own dynamic info failed"); + return PARAM_INVALID; + } + DynamicType dynamic_type = DynamicType::kDynamicUnknown; + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { + dynamic_type = DynamicType::kDynamicBatch; + } else if (!GetLocalOmgContext().dynamic_image_size.empty()) { + dynamic_type = DynamicType::kDynamicImageSize; + } else if (!GetLocalOmgContext().dynamic_dims.empty()) { + dynamic_type = DynamicType::kDynamicDims; } - std::vector>> user_designate_shape; user_designate_shape = domi::GetContext().user_input_dims; + user_designate_shape = GetLocalOmgContext().user_input_dims; GELOGI("Begin to copy graph for multi-batch"); multibatch::MultiBatchGraphCopyer copyer(graph); for (auto &shape : shapes) { copyer.AddShape(shape); } + copyer.SetDynamicType(dynamic_type); copyer.SetUserDesignateShape(user_designate_shape); copyer.SetDataToDynamicInfo(data_to_dynamic_info); return copyer.CopyGraph(); diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.h b/src/ge/graph/preprocess/multi_batch_copy_graph.h
index a0e61554..062b98d2 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.h +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.h @@ -37,6 +37,13 @@ enum NodeStatus { kNodeNotSupportNode, }; +enum DynamicType { + kDynamicBatch, + kDynamicImageSize, + kDynamicDims, + kDynamicUnknown, +}; + class MultiBatchGraphCopyer { public: explicit MultiBatchGraphCopyer(ComputeGraphPtr &graph) : graph_(graph) {} @@ -52,6 +59,7 @@ class MultiBatchGraphCopyer { void SetDataToDynamicInfo(const map>> &designate_shape) { data_to_dynamic_info_ = designate_shape; } + void SetDynamicType(const DynamicType dynamic_type) { dynamic_type_ = dynamic_type; } Status CopyGraph(); private: @@ -65,6 +73,7 @@ class MultiBatchGraphCopyer { NodePtr InsertShapeDataNode(); Status InsertSwitchNForData(const NodePtr &data); + Status InsertIdentityAfterSwitchN(); Status UpdateMaxShapeToData(const NodePtr &data); Status InsertMergeForEdgeNode(const NodePtr &node); @@ -93,7 +102,7 @@ class MultiBatchGraphCopyer { Status LinkNodeToMerge(const NodePtr &node, int out_index, const NodePtr &merge); Status CopyInDataEdges(const NodePtr &origin_node, int batch_num, const NodePtr &copyed_node); Status CopyInControlEdges(const NodePtr &node, int batch_num, const NodePtr &copyed_node); - + Status UpdateDataToDynamicInfo(const NodePtr &node); bool IsInBatchBranch(const NodePtr &node); NodeStatus GetNodeStatus(const NodePtr &node) { return origin_nodes_status_[node.get()]; }; Status CheckCopyResult(const std::vector &start_nodes); @@ -129,6 +138,9 @@ class MultiBatchGraphCopyer { // each data's own dynamic info map>> data_to_dynamic_info_; + + // dynamic type: dynamic batch, dynamic image size, dynamic dims. + DynamicType dynamic_type_ = DynamicType::kDynamicUnknown; }; } // namespace multibatch } // namespace ge diff --git a/src/ge/graph/preprocess/multi_batch_options.cc b/src/ge/graph/preprocess/multi_batch_options.cc index cbf8206f..005240ca 100644 --- a/src/ge/graph/preprocess/multi_batch_options.cc +++ b/src/ge/graph/preprocess/multi_batch_options.cc @@ -25,6 +25,7 @@ #include "graph/debug/ge_attr_define.h" #include "graph/utils/node_utils.h" #include "graph/ge_context.h" +#include "graph/common/local_context.h" namespace ge { namespace multibatch { @@ -59,9 +60,9 @@ void ParseDynamicSize(string dynamic_size, vector> &shapes) { /// @return true: Configed for Multi batch / false: Not configed for Multi batch.
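The DynamicType enum above is fed by ProcessMultiBatch, which picks the type from whichever ATC option is non-empty, in a fixed order. Distilled into a standalone sketch, with OmgContext reduced to the three string fields consulted there:

#include <string>

// Minimal stand-in for the context returned by GetLocalOmgContext().
struct OmgContext {
  std::string dynamic_batch_size;
  std::string dynamic_image_size;
  std::string dynamic_dims;
};

enum class DynamicType { kDynamicBatch, kDynamicImageSize, kDynamicDims, kDynamicUnknown };

// Same precedence as ProcessMultiBatch: batch first, then image size, then dims.
DynamicType SelectDynamicType(const OmgContext &ctx) {
  if (!ctx.dynamic_batch_size.empty()) return DynamicType::kDynamicBatch;
  if (!ctx.dynamic_image_size.empty()) return DynamicType::kDynamicImageSize;
  if (!ctx.dynamic_dims.empty()) return DynamicType::kDynamicDims;
  return DynamicType::kDynamicUnknown;
}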
/// bool InitDynamicParams(vector> &shapes) { - if (!domi::GetContext().dynamic_batch_size.empty()) { - GELOGD("Found dynamic batch option, value %s", domi::GetContext().dynamic_batch_size.c_str()); - std::vector dims = ge::StringUtils::Split(domi::GetContext().dynamic_batch_size, ','); + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { + GELOGD("Found dynamic batch option, value %s", GetLocalOmgContext().dynamic_batch_size.c_str()); + std::vector dims = ge::StringUtils::Split(GetLocalOmgContext().dynamic_batch_size, ','); for (const auto &dim : dims) { if (dim.empty()) { continue; @@ -71,18 +72,18 @@ bool InitDynamicParams(vector> &shapes) { } } - if (!domi::GetContext().dynamic_image_size.empty()) { - GELOGD("Found dynamic image size option, value %s", domi::GetContext().dynamic_image_size.c_str()); - ParseDynamicSize(domi::GetContext().dynamic_image_size, shapes); + if (!GetLocalOmgContext().dynamic_image_size.empty()) { + GELOGD("Found dynamic image size option, value %s", GetLocalOmgContext().dynamic_image_size.c_str()); + ParseDynamicSize(GetLocalOmgContext().dynamic_image_size, shapes); for (const auto &shape : shapes) { GELOGI("Found dynamic image size, shape %s", formats::JoinToString(shape).c_str()); } } - if (!domi::GetContext().dynamic_dims.empty()) { - GELOGD("Found dynamic dims option, value %s", domi::GetContext().dynamic_dims.c_str()); - ParseDynamicSize(domi::GetContext().dynamic_dims, shapes); + if (!GetLocalOmgContext().dynamic_dims.empty()) { + GELOGD("Found dynamic dims option, value %s", GetLocalOmgContext().dynamic_dims.c_str()); + ParseDynamicSize(GetLocalOmgContext().dynamic_dims, shapes); for (const auto &shape : shapes) { GELOGI("Found dynamic dims, shape %s", formats::JoinToString(shape).c_str()); @@ -99,14 +100,11 @@ bool InitDynamicParams(vector> &shapes) { /// @return true: Configed for Multi batch / false: Not configed for Multi batch. 
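For the dynamic-batch branch of InitDynamicParams above, every comma-separated token of --dynamic_batch_size becomes a one-element gear. A standalone sketch of that parse, substituting std::getline for ge::StringUtils::Split:

#include <cstdint>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

// "1,2,8" -> {{1}, {2}, {8}}, mirroring the dynamic-batch branch above;
// empty tokens are skipped just as in InitDynamicParams.
std::vector<std::vector<int64_t>> ParseBatchGears(const std::string &option) {
  std::vector<std::vector<int64_t>> shapes;
  std::stringstream ss(option);
  std::string dim;
  while (std::getline(ss, dim, ',')) {
    if (dim.empty()) {
      continue;
    }
    shapes.push_back({std::strtol(dim.c_str(), nullptr, 10)});
  }
  return shapes;
}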
/// Status ParserDataToDynmaicInfo(const vector> &shapes, + vector>> &data_name_and_shape, map>> &data_to_dynamic_info) { - if (domi::GetContext().user_input_dims.empty()) { - GELOGD("Get user designed shape failed"); - return FAILED; - } size_t cur_data_index = 0; - for (size_t index = 0; index < domi::GetContext().user_input_dims.size(); ++index) { - auto &cur_item = domi::GetContext().user_input_dims[index]; + for (size_t index = 0; index < data_name_and_shape.size(); ++index) { + auto &cur_item = data_name_and_shape[index]; auto &data_name = cur_item.first; auto &data_shape = cur_item.second; auto dynamic_dims_num = @@ -239,13 +237,13 @@ Status CalcShape(const std::vector &batch_shape, GeShape &data_shape) { Status StampDynamicType(const OpDescPtr &op_desc) { GE_CHECK_NOTNULL(op_desc); int32_t dynamic_type = static_cast(FIXED); - if (!domi::GetContext().dynamic_batch_size.empty()) { + if (!GetLocalOmgContext().dynamic_batch_size.empty()) { dynamic_type = static_cast(DYNAMIC_BATCH); } - if (!domi::GetContext().dynamic_image_size.empty()) { + if (!GetLocalOmgContext().dynamic_image_size.empty()) { dynamic_type = static_cast(DYNAMIC_IMAGE); } - if (!domi::GetContext().dynamic_dims.empty()) { + if (!GetLocalOmgContext().dynamic_dims.empty()) { dynamic_type = static_cast(DYNAMIC_DIMS); } if (!AttrUtils::SetInt(op_desc, ATTR_DYNAMIC_TYPE, dynamic_type)) { diff --git a/src/ge/graph/preprocess/multi_batch_options.h b/src/ge/graph/preprocess/multi_batch_options.h index 650020d9..18f667ae 100644 --- a/src/ge/graph/preprocess/multi_batch_options.h +++ b/src/ge/graph/preprocess/multi_batch_options.h @@ -54,10 +54,13 @@ Status CalcShape(const std::vector &batch_shape, GeShape &data_shape); /// /// @ingroup ge /// @brief parse each data's own dynamic dims. +/// @param [in] vector> &shapes: dynamic batch gears info. +/// @param [in] vector>> data_name_and_shape: eg:{{data:{1,1,-1,2}}}. /// @param [out] map>> &data_to_dynamic_info: key:data_name. value:dynamic dims. /// @return SUCCESS / PARAM_INVALID /// Status ParserDataToDynmaicInfo(const vector> &shapes, + vector>> &data_name_and_shape, map>> &data_to_dynamic_info); /// diff --git a/src/ge/host_kernels/rsqrt_kernel.cc b/src/ge/host_kernels/rsqrt_kernel.cc index f91e3399..5184d885 100644 --- a/src/ge/host_kernels/rsqrt_kernel.cc +++ b/src/ge/host_kernels/rsqrt_kernel.cc @@ -73,14 +73,12 @@ Status RsqrtKernel::RsqrtCompute(ConstGeTensorPtr &input_tensor_ptr, GeTensorPtr auto ptr = const_cast(reinterpret_cast(input_tensor_ptr->GetData().data())); for (size_t i = 0; i < data_count; i++) { if (ZeroCheck(*(ptr + i), data_type) != SUCCESS) { - GELOGE(PARAM_INVALID, "The input data can not be 0. 
"); - return PARAM_INVALID; + GELOGW("Rsqrt: The input data can not less than or equal to zero, rsqrt folding failed."); + return NOT_CHANGED; } switch (data_type) { case DT_FLOAT16: { double val = static_cast(*(reinterpret_cast(input_tensor_ptr->GetData().data()) + i)); - GE_IF_BOOL_EXEC(val < 0, GELOGE(PARAM_INVALID, "The denominator data %lf can not less than 0.", val); - return PARAM_INVALID); double drSqrt = 1.0 / std::sqrt(val); buf[i] = drSqrt; break; diff --git a/src/ge/hybrid/common/npu_memory_allocator.cc b/src/ge/hybrid/common/npu_memory_allocator.cc index 1908725f..cbb556e2 100644 --- a/src/ge/hybrid/common/npu_memory_allocator.cc +++ b/src/ge/hybrid/common/npu_memory_allocator.cc @@ -17,16 +17,17 @@ #include "npu_memory_allocator.h" #include #include "framework/common/debug/log.h" -#include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_caching_allocator.h" +#include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/rdma_pool_allocator.h" namespace ge { namespace hybrid { std::map> NpuMemoryAllocator::allocators_; std::mutex NpuMemoryAllocator::mu_; -AllocationAttr::AllocationAttr(int padding, void *try_reuse_addr) - : padding_(padding), try_reuse_addr_(try_reuse_addr) {} +AllocationAttr::AllocationAttr(int padding, void *try_reuse_addr, MemStorageType mem_type) + : padding_(padding), try_reuse_addr_(try_reuse_addr), mem_type_(mem_type) {} AllocationAttr::AllocationAttr(int padding) : AllocationAttr(padding, nullptr) {} AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reuse_addr) {} @@ -46,6 +47,7 @@ NpuMemoryAllocator::NpuMemoryAllocator(uint32_t device_id) : device_id_(device_i void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { void *try_reuse_addr = nullptr; size_t allocate_size = size; + MemStorageType mem_type = HBM; if (attr != nullptr) { try_reuse_addr = attr->try_reuse_addr_; if (attr->padding_ != 0) { @@ -53,10 +55,24 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { allocate_size = (size + 2 * attr->padding_ - 1) / attr->padding_ * attr->padding_; GELOGD("Padding size %ld by %d. final size = %zu.", size, attr->padding_, allocate_size); } + mem_type = attr->mem_type_; } - void *buffer = MemManager::CachingInstance(RT_MEMORY_HBM) - .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); + if (allocate_size == 0) { + GELOGE(MEMALLOC_FAILED, "Memory size is 0, device_id = %u, size = %zu", device_id_, allocate_size); + return nullptr; + } + + void *buffer = nullptr; + if (mem_type == RDMA_HBM) { + buffer = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Malloc(allocate_size, device_id_); + } else if (mem_type == HOST_DDR) { + buffer = malloc(allocate_size); + } else { + buffer = MemManager::Instance() + .CachingInstance(RT_MEMORY_HBM) + .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); + } if (buffer == nullptr) { GELOGE(MEMALLOC_FAILED, "Failed to malloc memory, device_id = %u, size = %zu", device_id_, allocate_size); return nullptr; @@ -66,11 +82,17 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { return buffer; } -void NpuMemoryAllocator::Deallocate(void *data) { +void NpuMemoryAllocator::Deallocate(void *data, MemStorageType mem_type) { GELOGI("To deallocating buffer, addr = %p", data); if (data != nullptr) { GELOGI("Deallocating buffer successfully. 
addr = %p", data); - MemManager::CachingInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + if (mem_type == RDMA_HBM) { + MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + } else if (mem_type == HOST_DDR) { + free(data); + } else { + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Free(reinterpret_cast(data), device_id_); + } } } diff --git a/src/ge/hybrid/common/npu_memory_allocator.h b/src/ge/hybrid/common/npu_memory_allocator.h index 7aa15578..99c01b34 100644 --- a/src/ge/hybrid/common/npu_memory_allocator.h +++ b/src/ge/hybrid/common/npu_memory_allocator.h @@ -32,7 +32,7 @@ class AllocationAttr { AllocationAttr() = default; explicit AllocationAttr(int padding); explicit AllocationAttr(void *try_reuse_addr); - AllocationAttr(int padding, void *try_reuse_addr); + AllocationAttr(int padding, void *try_reuse_addr, MemStorageType = HBM); ~AllocationAttr() = default; void SetMemType(MemStorageType memType) { mem_type_ = memType; } MemStorageType GetMemType() { return mem_type_; } @@ -56,7 +56,7 @@ class NpuMemoryAllocator { } void *Allocate(std::size_t size, AllocationAttr *attr = nullptr); - void Deallocate(void *data); + void Deallocate(void *data, MemStorageType mem_type = HBM); static constexpr int kDefaultPadding = 32; diff --git a/src/ge/hybrid/common/tensor_value.cc b/src/ge/hybrid/common/tensor_value.cc index 929d3c87..11a96d13 100644 --- a/src/ge/hybrid/common/tensor_value.cc +++ b/src/ge/hybrid/common/tensor_value.cc @@ -21,8 +21,8 @@ namespace ge { namespace hybrid { -TensorBuffer::TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size) - : allocator_(allocator), buffer_(buffer), size_(size) {} +TensorBuffer::TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size, MemStorageType mem_type) + : allocator_(allocator), buffer_(buffer), size_(size), mem_type_(mem_type) {} std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator, size_t size, AllocationAttr *attr) { void *buffer = nullptr; @@ -36,14 +36,18 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator return nullptr; } + MemStorageType mem_type = HBM; + if (attr != nullptr) { + mem_type = attr->GetMemType(); + } buffer = allocator->Allocate(size, attr); if (buffer == nullptr) { GELOGE(MEMALLOC_FAILED, "Failed to allocate memory. size = %zu", size); return nullptr; } - GELOGD("Tensor created. addr = %p, size = %zu", buffer, size); - return std::unique_ptr(new (std::nothrow) TensorBuffer(allocator, buffer, size)); + GELOGD("Tensor created. 
addr = %p, size = %zu, mem_type = %d", buffer, size, static_cast(mem_type)); + return std::unique_ptr(new (std::nothrow) TensorBuffer(allocator, buffer, size, mem_type)); } std::unique_ptr TensorBuffer::Create(void *buffer, size_t size) { @@ -53,7 +57,7 @@ std::unique_ptr TensorBuffer::Create(void *buffer, size_t size) { TensorBuffer::~TensorBuffer() { if (allocator_ != nullptr && buffer_ != nullptr) { - allocator_->Deallocate(buffer_); + allocator_->Deallocate(buffer_, mem_type_); } } diff --git a/src/ge/hybrid/common/tensor_value.h b/src/ge/hybrid/common/tensor_value.h index db8df9e5..d720e0e0 100644 --- a/src/ge/hybrid/common/tensor_value.h +++ b/src/ge/hybrid/common/tensor_value.h @@ -20,6 +20,7 @@ #include #include #include +#include "memory/memory_api.h" namespace ge { namespace hybrid { @@ -33,6 +34,8 @@ class TensorBuffer { static std::unique_ptr Create(void *buffer, size_t size); + TensorBuffer(const TensorBuffer &) = delete; + TensorBuffer &operator=(const TensorBuffer &) = delete; ~TensorBuffer(); void *GetData() { return buffer_; } @@ -40,11 +43,12 @@ class TensorBuffer { size_t GetSize() const { return size_; } private: - TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size); + TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size, MemStorageType mem_type = HBM); NpuMemoryAllocator *allocator_ = nullptr; void *buffer_ = nullptr; size_t size_ = 0; + MemStorageType mem_type_; }; class TensorValue { diff --git a/src/ge/hybrid/executor/worker/execution_engine.cc b/src/ge/hybrid/executor/worker/execution_engine.cc index b19d0849..1eb73e41 100644 --- a/src/ge/hybrid/executor/worker/execution_engine.cc +++ b/src/ge/hybrid/executor/worker/execution_engine.cc @@ -272,8 +272,9 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, TaskContext &task_ if (context.profiling_level > 0) { auto *ctx = &context; const string &name = node_state.GetName(); - task_context.RegisterCallback([ctx, name]() { RECORD_CALLBACK_EVENT(ctx, name.c_str(), "[Compute] Start"); }); + (void)task_context.RegisterCallback([ctx, name]() { RECORD_CALLBACK_EVENT(ctx, name.c_str(), "[Compute] Start"); }); } + RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] Start"); GE_CHK_STATUS_RET(node_item.node_executor->ExecuteTask(*task, task_context, callback), "[%s] Failed to execute task", node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] End"); @@ -286,8 +287,18 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const for (auto i = 0; i < task_context.NumInputs(); ++i) { const auto &input_tensor = task_context.GetInput(i); GE_CHECK_NOTNULL(input_tensor); + if (input_tensor->GetData() == nullptr) { + GELOGD("[%s] Skipping null input, index = %d", task_context.GetNodeName(), i); + continue; + } + const auto &tensor_desc = node_state.GetOpDesc()->MutableInputDesc(i); GE_CHECK_NOTNULL(tensor_desc); + if (tensor_desc->GetDataType() == DT_STRING) { + GELOGD("[%s] Skipping DT_STRING input, index = %d", task_context.GetNodeName(), i); + continue; + } + int64_t expected_size; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, expected_size)); GELOGD("[%s] Input[%d] expects [%ld] bytes.", task_context.GetNodeName(), i, expected_size); diff --git a/src/ge/hybrid/model/hybrid_model_builder.cc b/src/ge/hybrid/model/hybrid_model_builder.cc index 97783711..45fb3a6a 100644 --- a/src/ge/hybrid/model/hybrid_model_builder.cc +++ 
b/src/ge/hybrid/model/hybrid_model_builder.cc @@ -36,20 +36,24 @@ const uint32_t kAlignment = 32; const int kBytes = 8; int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { - int64_t var_size = GetSizeByDataType(desc.GetDataType()); - if (var_size <= 0) { - GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", - TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); - return -1; - } - auto shape = desc.GetShape(); - auto dim_num = shape.GetDimNum(); - for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { - var_size *= shape.GetDim(dim_index); + int64_t var_size = 0; + auto data_type = desc.GetDataType(); + if (data_type == DT_STRING) { + (void)TensorUtils::GetSize(desc, var_size); + } else { + var_size = GetSizeByDataType(data_type); + if (var_size <= 0) { + GELOGW("Failed to calc var data size from data type %s", TypeUtils::DataTypeToSerialString(data_type).c_str()); + return -1; + } + auto shape = desc.GetShape(); + auto dim_num = shape.GetDimNum(); + for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { + var_size *= shape.GetDim(dim_index); + } + // padding up to multiple of kAlignment, and add extra kAlignment + var_size = (var_size + kAlignment * 2 - 1) / kAlignment * kAlignment; } - - // padding up to multiple of kAlignment, and add extra kAlignment - var_size = (var_size + kAlignment * 2 - 1) / kAlignment * kAlignment; return var_size; } } // namespace @@ -614,11 +618,6 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ } int64_t var_size = CalcVarSizeInBytes(*tensor_desc); - if (var_size < 0) { - GELOGE(INTERNAL_ERROR, "[%s] Invalid var size: %ld", var_name.c_str(), var_size); - return INTERNAL_ERROR; - } - tensor.reset(new (std::nothrow) TensorValue(dev_mem, var_size)); GE_CHECK_NOTNULL(tensor); return SUCCESS; diff --git a/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index 6cf7363e..cc140b08 100644 --- a/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/src/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -28,7 +28,7 @@ namespace hybrid { REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::GE_LOCAL, GeLocalNodeExecutor); const std::unordered_map> RefInputTask::out_ref_input_index_ = { - {DATA, {}}, {AIPPDATA, {}}, {RESHAPE, {}}, {EXPANDDIMS, {}}}; + {DATA, {}}, {AIPPDATA, {}}, {RESHAPE, {}}, {EXPANDDIMS, {}}, {SQUEEZE, {}}, {BROADCASTGRADIENTARGS, {}}}; const std::unordered_set DependInputShapeTask::depend_input_shape_ops_ = {SHAPE, SHAPEN, RANK, SIZE}; diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 57b426d8..f2cd1888 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -15,15 +15,21 @@ */ #include "hybrid/node_executor/hccl/hccl_node_executor.h" -#include "graph/manager/util/hcom_util.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/fmk_error_codes.h" #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" +#include "framework/common/debug/ge_log.h" #include "graph/attr_value.h" #include "graph/debug/ge_attr_define.h" +#include "graph/manager/util/hcom_util.h" +#include "graph/runtime_inference_context.h" #include "hccl/hcom.h" +namespace { +const size_t kVarTableDims = 2; +const size_t kVarTableRowCnt = 3; +const size_t kVarTableIdxAddr = 1; +const size_t kVarTableIdxLen = 
2; +} // namespace namespace ge { namespace hybrid { @@ -35,8 +41,8 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGE(FAILED, "hccl handle is nullptr! "); return FAILED; } - auto EnqueueHcomOpertion = (hcclResult_t(*)(HcomOpertion, std::function))dlsym( - context.handle_, "EnqueueHcomOpertion"); + auto EnqueueHcomOpertion = + (HcclResult(*)(HcomOpertion, std::function))dlsym(context.handle_, "EnqueueHcomOpertion"); if (EnqueueHcomOpertion == nullptr) { GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function."); if (dlclose(context.handle_) != 0) { @@ -74,7 +80,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do return PARAM_INVALID; } op_info.dataType = iter->second; - hcclRedOp_t op_type = HCCL_REP_OP_SUM; + HcclReduceOp op_type = HCCL_REDUCE_SUM; if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER || op_desc->GetType() == HVDCALLBACKALLREDUCE) { GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed"); @@ -85,7 +91,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed"); } op_info.root = root_id; - auto callback = [this, op_desc](hcclResult_t status) { + auto callback = [this, op_desc](HcclResult status) { if (status != HCCL_SUCCESS) { GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status); } @@ -94,14 +100,14 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGI("node %s hccl callback success.", op_desc->GetName().c_str()); }; int32_t count = 0; - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), op_desc->GetType() == HCOMALLGATHER, count), "GetHcomCount failed"); GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.", context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; - hcclResult_t hccl_ret = EnqueueHcomOpertion(op_info, callback); + HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { GELOGE(HCCL_E_INTERNAL, "Call EnqueueHcomOpertion failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; @@ -116,6 +122,119 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do return SUCCESS; } +Status RdmaNodeTask::UpdateArgs(TaskContext &context) { return SUCCESS; } + +Status RdmaNodeTask::Init(TaskContext &context) { + GELOGI("[%s] RdmaNodeTask::Init in.", context.GetNodeName()); + const NodeItem &node_item = context.GetNodeItem(); + GE_CHECK_NOTNULL(node_item.op_desc); + auto remote_idx = node_item.op_desc->GetInputIndexByName("remote"); + auto in_data_anchor = node_item.node->GetInDataAnchor(remote_idx); + GE_CHECK_NOTNULL(in_data_anchor); + auto out_data_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(out_data_anchor); + auto peer_node = out_data_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(peer_node->GetOpDesc()); + + remote_index_ = {peer_node->GetOpDesc()->GetId(), out_data_anchor->GetIdx()}; + if (node_item.node->GetType() == HCOMREMOTEREAD) { + local_index_ = 0; + } else { + local_index_ = node_item.op_desc->GetInputIndexByName("local"); + } + return SUCCESS; +} + +Status RdmaNodeTask::ExtractTensor(TaskContext &context, 
vector &addr_infos) { + RuntimeInferenceContext *ctx = nullptr; + GE_CHK_STATUS_RET(RuntimeInferenceContext::GetContext(std::to_string(context.GetSessionId()), &ctx)); + + ge::Tensor remote_tensor; + GE_CHK_STATUS_RET(ctx->GetTensor(remote_index_.first, remote_index_.second, remote_tensor)); + auto data = reinterpret_cast(remote_tensor.GetData()); + if (data == nullptr) { + GELOGE(FAILED, "Tensor data is nullptr."); + return FAILED; + } + auto dims = remote_tensor.GetTensorDesc().GetShape().GetDims(); + if (dims.size() != kVarTableDims || dims.back() != kVarTableRowCnt) { + GELOGE(PARAM_INVALID, "Variable table shape check failed"); + return PARAM_INVALID; + } + + size_t remote_size = 0; + for (auto idx = 0; idx < dims.front(); ++idx) { + remote_size += data[idx * kVarTableRowCnt + kVarTableIdxLen]; + } + + if (context.GetNodeItem().NodeType() == HCOMREMOTEREAD) { + auto allocator = NpuMemoryAllocator::GetAllocator(); + GE_CHECK_NOTNULL(allocator); + AllocationAttr attr; + attr.SetMemType(RDMA_HBM); + for (auto i = 0; i < context.NumOutputs(); ++i) { + GELOGD("Allocate rdma memory for node %s, size: %zu", context.GetNodeName(), remote_size); + auto tensor_buffer = TensorBuffer::Create(allocator, remote_size, &attr); + GE_CHK_STATUS_RET(context.SetOutput(i, TensorValue(std::shared_ptr(tensor_buffer.release())))); + } + } + + TensorValue *tv; + if (context.GetNodeItem().NodeType() == HCOMREMOTEREAD) { + tv = context.MutableOutput(0); + } else { + tv = context.MutableInput(local_index_); + } + GE_CHECK_NOTNULL(tv); + auto local_addr = reinterpret_cast(reinterpret_cast(tv->MutableData())); + for (auto idx = 0; idx < dims.front(); ++idx) { + addr_infos.push_back({static_cast(data[idx * kVarTableRowCnt]), + data[idx * kVarTableRowCnt + kVarTableIdxAddr], local_addr, + data[idx * kVarTableRowCnt + kVarTableIdxLen]}); + local_addr += data[idx * kVarTableRowCnt + kVarTableIdxLen]; + } + + return SUCCESS; +} + +Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { + GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName()); + auto EnqueueRemoteAccess = + (HcclResult(*)(const string &, const vector &, + std::function))dlsym(context.handle_, "EnqueueRemoteAccess"); + if (EnqueueRemoteAccess == nullptr) { + GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function."); + if (dlclose(context.handle_) != 0) { + GELOGW("Failed to close handle %s", dlerror()); + } + return FAILED; + } + vector addr_infos; + GE_CHK_STATUS_RET(ExtractTensor(context, addr_infos)); + + auto callback = [this](HcclResult status) { + if (status != HCCL_SUCCESS) { + GELOGE(HCCL_E_INTERNAL, "Call EnqueueRemoteAccess failed, ret: 0x%X", status); + } + std::lock_guard lock(this->hccl_mutex_); + this->cond_.notify_all(); + GELOGI("rdma callback success."); + }; + HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); + if (hccl_ret != HCCL_SUCCESS) { + GELOGE(HCCL_E_INTERNAL, "Call EnqueueRemoteAccess failed, ret: 0x%X", hccl_ret); + return HCCL_E_INTERNAL; + } + + // pending until hccl finished + std::unique_lock ulock(hccl_mutex_); + cond_.wait(ulock); + + (void)context.RegisterCallback(done_callback); + GELOGI("[%s] RdmaNodeTask::ExecuteAsync success.", context.GetNodeName()); + return SUCCESS; +} + Status HcclNodeTask::UpdateArgs(TaskContext &context) { return SUCCESS; } Status HcclNodeTask::Init(TaskContext &context) { @@ -127,8 +246,10 @@ Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const 
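Note on the variable table consumed by RdmaNodeTask::ExtractTensor above: the remote tensor is a flattened 2-D table whose rows each carry kVarTableRowCnt (3) uint64 entries, [rank, remote address, length], and local destinations are packed contiguously, each row advancing the local address by that row's length. A minimal standalone sketch of that row walk, assuming exactly this layout (the AddrInfo struct is a hypothetical stand-in for the real HcomRemoteAccessAddrInfo):

#include <cstdint>
#include <vector>

struct AddrInfo {  // hypothetical stand-in for HcomRemoteAccessAddrInfo
  uint32_t remote_rank;
  uint64_t remote_addr;
  uint64_t local_addr;
  uint64_t length;
};

// Walks a [row_cnt x 3] table of {rank, remote_addr, length} rows, packing
// local destinations contiguously from local_base, as ExtractTensor does.
std::vector<AddrInfo> ParseVarTable(const uint64_t *table, int64_t row_cnt, uint64_t local_base) {
  constexpr int64_t kRowCnt = 3;  // mirrors kVarTableRowCnt
  std::vector<AddrInfo> infos;
  uint64_t local_addr = local_base;
  for (int64_t row = 0; row < row_cnt; ++row) {
    const uint64_t *entry = table + row * kRowCnt;
    infos.push_back({static_cast<uint32_t>(entry[0]), entry[1], local_addr, entry[2]});
    local_addr += entry[2];  // next output lands right after this chunk
  }
  return infos;
}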
GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName()); GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed."); - // allocate output mem - GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + // allocate output mem, output mem or remote read will be calculated when node execute. + if (context.GetNodeItem().NodeType() != HCOMREMOTEREAD) { + GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + } GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed."); GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName()); @@ -138,8 +259,11 @@ Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr &task) const { GELOGI("[%s] HcclNodeExecutor::LoadTask in.", node->GetName().c_str()); GE_CHECK_NOTNULL(node); - - task = MakeShared(); + if (node->GetType() == HCOMREMOTEREAD || node->GetType() == HCOMREMOTEWRITE) { + task = MakeShared(); + } else { + task = MakeShared(); + } GE_CHECK_NOTNULL(task); GELOGI("[%s] HcclNodeExecutor::LoadTask success.", node->GetName().c_str()); return SUCCESS; @@ -169,12 +293,12 @@ Status HcclNodeExecutor::Initialize() { GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); return FAILED; } - auto HcomExcutorInitialize = (hcclResult_t(*)())dlsym(handle_, "HcomExcutorInitialize"); + auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize"); if (HcomExcutorInitialize == nullptr) { GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function."); return FAILED; } - hcclResult_t hccl_ret = HcomExcutorInitialize(); + HcclResult hccl_ret = HcomExcutorInitialize(); if (hccl_ret == HCCL_E_PTR) { GELOGI("Hccl comm is null, hcom executor initialize is not required."); } else if (hccl_ret == HCCL_SUCCESS) { @@ -187,12 +311,12 @@ Status HcclNodeExecutor::Initialize() { } Status HcclNodeExecutor::Finalize() { - auto HcomExcutorFinalize = (hcclResult_t(*)())dlsym(handle_, "HcomExcutorFinalize"); + auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize"); if (HcomExcutorFinalize == nullptr) { GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function."); return FAILED; } - hcclResult_t hccl_ret = HcomExcutorFinalize(); + HcclResult hccl_ret = HcomExcutorFinalize(); if (hccl_ret != HCCL_SUCCESS) { GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret); return FAILED; diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h index 8791c4e3..ddf6eb3a 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h @@ -16,9 +16,9 @@ #ifndef HYBRID_HCCL_NODE_EXECUTOR_H_ #define HYBRID_HCCL_NODE_EXECUTOR_H_ -#include "hybrid/node_executor/node_executor.h" -#include "hybrid/model/hybrid_model.h" #include "graph/op_desc.h" +#include "hybrid/model/hybrid_model.h" +#include "hybrid/node_executor/node_executor.h" namespace ge { namespace hybrid { @@ -41,6 +41,24 @@ class HcclNodeTask : public NodeTask { std::condition_variable cond_; }; +class RdmaNodeTask : public NodeTask { + public: + RdmaNodeTask() = default; + + ~RdmaNodeTask() override {} + + Status UpdateArgs(TaskContext &context) override; + Status ExecuteAsync(TaskContext &context, std::function done_callback) override; + Status 
Init(TaskContext &context) override; + + private: + Status ExtractTensor(TaskContext &context, vector &addr_infos); + std::pair remote_index_; + int32_t local_index_ = 0; + std::mutex hccl_mutex_; + std::condition_variable cond_; +}; + class HcclNodeExecutor : public NodeExecutor { public: Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr &task) const; diff --git a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc index fbad1fcd..49ff722f 100644 --- a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc +++ b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc @@ -18,7 +18,6 @@ #include "hybrid/node_executor/host_cpu/kernel_factory.h" #include "graph/passes/folding_pass.h" #include "hybrid/model/hybrid_model.h" -#include "inc/kernel_factory.h" #include "ge_local_engine/engine/host_cpu_engine.h" namespace ge { @@ -32,16 +31,8 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) { Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start execute.", context.GetNodeName()); - - std::vector inputs; - std::vector outputs; - GE_CHK_STATUS_RET(ProcessInputs(context, inputs), "node:%s type:%s, process inputs failed.", node_->GetName().c_str(), - node_->GetType().c_str()); - GE_CHK_STATUS_RET(Execute(context, inputs, outputs), "node:%s type:%s, task execute failed.", - node_->GetName().c_str(), node_->GetType().c_str()); - GE_CHK_STATUS_RET(ProcessOutputs(context, outputs), "node:%s type:%s, process outputs failed.", - node_->GetName().c_str(), node_->GetType().c_str()); - + GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.", node_->GetName().c_str(), + node_->GetType().c_str()) if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); done_callback(); @@ -50,98 +41,48 @@ Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function &inputs) { - int32_t input_num = context.NumInputs(); - for (auto i = 0; i < input_num; ++i) { - auto tensor_value = context.GetInput(i); - GE_CHECK_NOTNULL(tensor_value); - GeTensorPtr input_ptr = - MakeShared(node_->GetOpDesc()->GetInputDesc(i), - reinterpret_cast(tensor_value->GetData()), tensor_value->GetSize()); - if (input_ptr == nullptr) { - GELOGE(MEMALLOC_FAILED, "Make shared failed"); - return MEMALLOC_FAILED; - } - inputs.push_back(input_ptr); - } - return SUCCESS; -} +Status CpuKernelNodeTask::Execute(TaskContext &context) { + const auto &op_desc = node_->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); -Status HostNodeTaskBase::ProcessOutputs(TaskContext &context, std::vector &outputs) { - int32_t output_num = context.NumOutputs(); - if (static_cast(output_num) != outputs.size()) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.", - node_->GetName().c_str(), node_->GetType().c_str(), output_num, outputs.size()); - return INTERNAL_ERROR; + std::vector inputs; + for (int32_t i = 0; i < context.NumInputs(); ++i) { + const auto &input_desc = op_desc->GetInputDesc(i); + auto in_tensor = MakeShared(input_desc, reinterpret_cast(context.GetInput(i)->GetData()), + context.GetInput(i)->GetSize()); + GE_CHECK_NOTNULL(in_tensor); + in_tensor->MutableTensorDesc().SetDataType(input_desc.GetDataType()); + in_tensor->MutableTensorDesc().SetShape(input_desc.GetShape()); + inputs.emplace_back(in_tensor); + GELOGI("node:%s allocate input %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, + 
reinterpret_cast(in_tensor->GetData().data()), in_tensor->GetData().size()); } - // alloc output - GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs()); - - // copy data to output - for (auto i = 0; i < output_num; ++i) { - GeTensorPtr &tensor = outputs[i]; - GE_CHECK_NOTNULL(tensor); - auto tensor_data = tensor->GetData(); - auto tensor_value = context.MutableOutput(i); - GE_CHECK_NOTNULL(tensor_value); - if (tensor_data.GetSize() > tensor_value->GetSize()) { - GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", - node_->GetName().c_str(), node_->GetType().c_str(), i, tensor_data.GetSize(), tensor_value->GetSize()); - return INTERNAL_ERROR; - } - - GELOGI("node:%s type:%s [%d]th output data=%p, out size=%zu, data size=%zu.", node_->GetName().c_str(), - node_->GetType().c_str(), i, tensor_value->GetData(), tensor_value->GetSize(), tensor_data.GetSize()); - if (tensor_data.GetSize() > 0) { - GE_CHK_RT_RET(rtMemcpy(tensor_value->MutableData(), tensor_value->GetSize(), tensor_data.GetData(), - tensor_data.GetSize(), RT_MEMCPY_HOST_TO_HOST)); + std::vector outputs; + for (int32_t i = 0; i < context.NumOutputs(); ++i) { + const auto &output_desc = op_desc->GetOutputDesc(i); + AllocationAttr attr; + attr.SetMemType(HOST_DDR); + if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) { + GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i); + return FAILED; } - GELOGI("node:%s type:%s [%d]th set data success, data size=%zu.", node_->GetName().c_str(), - node_->GetType().c_str(), i, tensor_data.GetSize()); - } - - return SUCCESS; -} - -Status CpuKernelNodeTask::Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) { - std::vector const_inputs; - for (const auto &input : inputs) { - const_inputs.emplace_back(input); - } - return FoldingPass::RunOpKernel(node_, const_inputs, outputs); -} - -Status HostKernelNodeTask::Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) { - auto kernel = KernelFactory::Instance().Create(node_->GetType()); - if (kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", node_->GetName().c_str(), - node_->GetType().c_str()); - return UNSUPPORTED; - } - - std::vector const_inputs; - for (const auto &input : inputs) { - const_inputs.emplace_back(input); - } - Status compute_ret = kernel->Compute(node_->GetOpDesc(), const_inputs, outputs); - if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", node_->GetName().c_str(), - node_->GetType().c_str()); - return compute_ret; + auto tensor = context.GetOutput(i); + GE_CHECK_NOTNULL(tensor); + auto out_tensor = + MakeShared(output_desc, reinterpret_cast(tensor->GetData()), tensor->GetSize()); + GE_CHECK_NOTNULL(out_tensor); + out_tensor->MutableTensorDesc().SetDataType(output_desc.GetDataType()); + out_tensor->MutableTensorDesc().SetShape(output_desc.GetShape()); + outputs.emplace_back(out_tensor); + GELOGI("node:%s allocate output %d, addr=%p, size=%zu", op_desc->GetName().c_str(), i, + reinterpret_cast(out_tensor->GetData().data()), out_tensor->GetData().size()); } - return SUCCESS; + return HostCpuEngine::GetInstance().Run(node_, inputs, outputs); } -Status HostCpuNodeTask::ProcessInputs(TaskContext &context, std::vector &inputs) { return SUCCESS; } - -Status HostCpuNodeTask::ProcessOutputs(TaskContext &context, std::vector &outputs) { return SUCCESS; } - -Status HostCpuNodeTask::Execute(TaskContext 
&context, const std::vector &inputs, - std::vector &outputs) { +Status HostCpuNodeTask::Execute(TaskContext &context) { RunContext run_context; auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); if (host_kernel == nullptr) { @@ -175,10 +116,6 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no GELOGI("create CpuKernelNodeTask for node %s, type %s.", name.c_str(), type.c_str()); task = MakeShared(node); GE_CHECK_NOTNULL(task); - } else if (KernelFactory::Instance().Create(type) != nullptr) { - GELOGI("create HostKernelNodeTask for node %s, type %s.", name.c_str(), type.c_str()); - task = MakeShared(node); - GE_CHECK_NOTNULL(task); } else if (hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node) != nullptr) { GELOGI("create HostCpuNodeTask for node %s, type %s.", name.c_str(), type.c_str()); task = MakeShared(node); diff --git a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h index b27e558b..036a0c60 100644 --- a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h +++ b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.h @@ -17,58 +17,41 @@ #ifndef GE_HYBRID_KERNEL_HOST_CPU_NODE_EXECUTOR_H_ #define GE_HYBRID_KERNEL_HOST_CPU_NODE_EXECUTOR_H_ -#include "inc/kernel.h" #include "hybrid/node_executor/node_executor.h" +#include "inc/kernel.h" namespace ge { namespace hybrid { class HostNodeTaskBase : public NodeTask { public: explicit HostNodeTaskBase(const NodePtr &node) : node_(node) {} - ~HostNodeTaskBase() = default; - virtual Status UpdateArgs(TaskContext &context); - virtual Status ExecuteAsync(TaskContext &context, std::function done_callback); + ~HostNodeTaskBase() override = default; + Status UpdateArgs(TaskContext &context) override; + Status ExecuteAsync(TaskContext &context, std::function done_callback) override; protected: NodePtr node_; private: - virtual Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) = 0; - virtual Status ProcessInputs(TaskContext &context, std::vector &inputs); - virtual Status ProcessOutputs(TaskContext &context, std::vector &outputs); + virtual Status Execute(TaskContext &context) = 0; }; class CpuKernelNodeTask : public HostNodeTaskBase { public: explicit CpuKernelNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~CpuKernelNodeTask() = default; - - private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; -}; - -class HostKernelNodeTask : public HostNodeTaskBase { - public: - explicit HostKernelNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~HostKernelNodeTask() = default; + ~CpuKernelNodeTask() override = default; private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; + Status Execute(TaskContext &context) override; }; class HostCpuNodeTask : public HostNodeTaskBase { public: explicit HostCpuNodeTask(const NodePtr &node) : HostNodeTaskBase(node) {} - ~HostCpuNodeTask() = default; + ~HostCpuNodeTask() override = default; private: - Status Execute(TaskContext &context, const std::vector &inputs, - std::vector &outputs) override; - Status ProcessInputs(TaskContext &context, std::vector &inputs) override; - Status ProcessOutputs(TaskContext &context, std::vector &outputs) override; + Status Execute(TaskContext &context) override; }; class HostCpuNodeExecutor : public NodeExecutor { diff --git 
a/src/ge/hybrid/node_executor/task_context.cc b/src/ge/hybrid/node_executor/task_context.cc index dd833fe1..e49a2b43 100644 --- a/src/ge/hybrid/node_executor/task_context.cc +++ b/src/ge/hybrid/node_executor/task_context.cc @@ -18,6 +18,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/debug/log.h" #include "graph/utils/tensor_utils.h" +#include "graph/debug/ge_attr_define.h" #include "hybrid/executor/hybrid_execution_context.h" #include "hybrid/executor/subgraph_executor.h" @@ -225,7 +226,15 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) { for (int i = 0; i < node_item_->num_outputs; ++i) { const auto &output_desc = node_item_->op_desc->MutableOutputDesc(i); GE_CHECK_NOTNULL(output_desc); - GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, attr)); + uint32_t mem_type = 0; + (void)AttrUtils::GetInt(node_item_->op_desc, ATTR_OUTPUT_MEMORY_TYPE, mem_type); + if (attr == nullptr) { + auto tmp_attr = AllocationAttr(0, nullptr, static_cast(mem_type)); + GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, &tmp_attr)); + } else { + attr->SetMemType(static_cast(mem_type)); + GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, attr)); + } } return SUCCESS; diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index 0532321e..d5e745eb 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -29,15 +29,17 @@ #include "common/profiling/profiling_manager.h" #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" #include "framework/common/util.h" +#include "analyzer/analyzer.h" #include "ge/ge_api_types.h" #include "ge_local_engine/engine/host_cpu_engine.h" +#include "graph/common/ge_call_wrapper.h" #include "graph/ge_context.h" #include "graph/ge_global_options.h" #include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" -#include "graph/common/ge_call_wrapper.h" #include "omm/csa_interact.h" #include "runtime/kernel.h" @@ -142,8 +144,15 @@ Status GELib::InnerInitialize(const map &options) { return initHostCpuEngineStatus; } + GELOGI("Start to init Analyzer!"); + Status init_analyzer_status = ge::Analyzer::GetInstance()->Initialize(); + if (init_analyzer_status != SUCCESS) { + GELOGE(init_analyzer_status, "Failed to initialize Analyzer"); + RollbackInit(); + return init_analyzer_status; + } + init_flag_ = true; - GELOGI("GeLib initial success."); return SUCCESS; } @@ -159,6 +168,11 @@ Status GELib::SystemInitialize(const map &options) { // In train and infer, profiling is always needed. 
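Besides profiling, the SystemInitialize hunk below also threads the new exception-dump switch through ModelManager::EnableExceptionDump. A minimal client-side sketch of enabling it at initialization; the GEInitialize/Session flow is illustrative, and the "1" value convention is an assumption in line with the other ge.exec flags:

#include <map>
#include <string>
#include "ge/ge_api.h"

int EnableExceptionDumpExample() {
  std::map<std::string, std::string> options;
  // key is OPTION_EXEC_ENABLE_EXCEPTION_DUMP from ge_api_types.h
  options["ge.exec.enable_exception_dump"] = "1";
  if (ge::GEInitialize(options) != ge::SUCCESS) {
    return -1;
  }
  ge::Session session(options);  // SystemInitialize -> ModelManager::EnableExceptionDump(options)
  return 0;
}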
InitOptions(options); InitProfiling(this->options_); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS, + GELOGE(FAILED, "Enable exception dump failed"); + return FAILED); // 1.`is_train_mode_` means case: train // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer // these two case need call `InitSystemWithOptions->rtGetDeviceIndexByPhyId` @@ -278,20 +292,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); - options.physical_device_id = options.device_id; - - // The physical ID is transferred to the logical ID. FMK receives physical ID and needs to be converted - uint32_t dev_logic_index = 0; - rtError_t rt_ret = rtGetDeviceIndexByPhyId(static_cast(options.device_id), &dev_logic_index); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - GELOGE(rt_ret, "rtGetDeviceIndexByPhyId transform index by phyId %d failed", options.device_id); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_ENV_INIT); - return FAILED); - options.device_id = static_cast(dev_logic_index); - GELOGI("rtGetDeviceIndexByPhyId physical device id:%d,logical device id:%u", options.device_id, dev_logic_index); - - GetContext().SetCtxDeviceId(dev_logic_index); + // set device id + GELOGI("set logical device id:%u", options.device_id); + GetContext().SetCtxDeviceId(static_cast(options.device_id)); GE_CHK_RT_RET(rtSetDevice(options.device_id)); // In the scenario that the automatic add fusion is set, but there is no cleanaddr operator, @@ -389,6 +393,9 @@ Status GELib::Finalize() { GELOGI("HostCpuEngine finalization."); HostCpuEngine::GetInstance().Finalize(); + GELOGI("Analyzer finalization"); + Analyzer::GetInstance()->Finalize(); + // Shut down profiling ShutDownProfiling(); diff --git a/src/ge/ir_build/atc_ir_common.cc b/src/ge/ir_build/atc_ir_common.cc index dbfe688b..82ed40bd 100644 --- a/src/ge/ir_build/atc_ir_common.cc +++ b/src/ge/ir_build/atc_ir_common.cc @@ -476,13 +476,6 @@ Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory) GELOGE(ge::PARAM_INVALID, "Input parameter[--disable_reuse_memory]'s value must be 1 or 0."); return ge::PARAM_INVALID; } - - const char *env_ge_dump = std::getenv("DUMP_OP"); - const int decimal = 10; - int ge_dump_flag = (env_ge_dump != nullptr) ? 
std::strtol(env_ge_dump, nullptr, decimal) : 0; - if (ge_dump_flag && (disable_reuse_memory == "0")) { - GELOGW("Will dump uncorrect op data with param disable_reuse_memory=0"); - } return ge::SUCCESS; } diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc index 0a60fa11..90f7a8ca 100644 --- a/src/ge/ir_build/ge_ir_build.cc +++ b/src/ge/ir_build/ge_ir_build.cc @@ -31,11 +31,11 @@ #include "graph/compute_graph.h" #include "graph/ge_tensor.h" #include "graph/utils/type_utils.h" +#include "graph/ge_global_options.h" #include "init/gelib.h" #include "ir_build/atc_ir_common.h" #include "model/ge_model.h" -using domi::GetContext; using std::string; using namespace std; @@ -133,25 +133,24 @@ void aclgrphBuildFinalize() { class Impl { public: Impl() { - GetContext().format = domi::DOMI_TENSOR_ND; - GetContext().input_nodes_format_map.clear(); - GetContext().output_formats.clear(); - GetContext().user_input_dims.clear(); - GetContext().input_dims.clear(); - GetContext().op_conf_map.clear(); - GetContext().out_nodes_map.clear(); - GetContext().user_out_nodes.clear(); - GetContext().net_format = domi::DOMI_TENSOR_RESERVED; - GetContext().type = domi::FRAMEWORK_RESERVED; - GetContext().run_mode = ONLY_PRE_CHECK; - GetContext().train_flag = false; - GetContext().fp16_high_precision = HIGH_PRECISION_DEFAULT; - GetContext().output_type.clear(); - GetContext().net_name.clear(); - GetContext().is_dynamic_input = false; - GetContext().dynamic_batch_size.clear(); - GetContext().dynamic_image_size.clear(); - GetContext().dynamic_dims.clear(); + omg_context_ = domi::GetContext(); + omg_context_.format = domi::DOMI_TENSOR_ND; + omg_context_.input_nodes_format_map.clear(); + omg_context_.output_formats.clear(); + omg_context_.user_input_dims.clear(); + omg_context_.input_dims.clear(); + omg_context_.op_conf_map.clear(); + omg_context_.out_nodes_map.clear(); + omg_context_.user_out_nodes.clear(); + omg_context_.net_format = domi::DOMI_TENSOR_RESERVED; + omg_context_.type = domi::FRAMEWORK_RESERVED; + omg_context_.run_mode = ONLY_PRE_CHECK; + omg_context_.train_flag = false; + omg_context_.output_type.clear(); + omg_context_.is_dynamic_input = false; + omg_context_.dynamic_batch_size.clear(); + omg_context_.dynamic_image_size.clear(); + omg_context_.dynamic_dims.clear(); }; ~Impl() { (void)generator_.Finalize(); }; graphStatus CheckOptions(const std::map &options); @@ -161,24 +160,52 @@ class Impl { ModelBufferData &ge_models); graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format, bool is_dynamic_input); + void SetRtSocVersion(); public: ge::GeGenerator generator_; std::map options_; bool is_dynamic_input_ = false; + OmgContext omg_context_; }; graphStatus Impl::CheckOptions(const std::map &options) { for (auto &ele : options) { auto it = ge::ir_option::ir_builder_suppported_options.find(ele.first); if (it == ge::ir_option::ir_builder_suppported_options.end()) { - GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option(%s).Please check!", ele.first.c_str()); - return GRAPH_PARAM_INVALID; + auto it_lx_fusion = ir_builder_supported_options_for_lx_fusion.find(ele.first); + if (it_lx_fusion == ir_builder_supported_options_for_lx_fusion.end()) { + GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option(%s).Please check!", ele.first.c_str()); + return GRAPH_PARAM_INVALID; + } } options_.insert(ele); } + // Check options build_mode and build_step. 
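The validation that follows admits only whitelisted build_mode/build_step values and, in particular, rejects a tuning build that names no step. For illustration, two option sets; the "ge.buildMode"/"ge.buildStep" key strings and the "after_builder" step value are assumptions about the BUILD_MODE / BUILD_STEP / BUILD_MODE_TUNING constants referenced in this hunk:

#include <map>
#include <string>

// Accepted: tuning mode paired with an explicit step from build_step_options.
const std::map<std::string, std::string> kAccepted = {
    {"ge.buildMode", "tuning"},
    {"ge.buildStep", "after_builder"}};

// Rejected with GRAPH_PARAM_INVALID: tuning mode without any build step.
const std::map<std::string, std::string> kRejected = {{"ge.buildMode", "tuning"}};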
+ std::string build_mode; + auto it = options_.find(BUILD_MODE); + if (it != options_.end() && !(it->second.empty())) { + if (build_mode_options.find(it->second) == build_mode_options.end()) { + GELOGE(GRAPH_PARAM_INVALID, "Build mode:%s is unsupported. Please check!", it->second.c_str()); + return GRAPH_PARAM_INVALID; + } + build_mode = it->second; + } + it = options_.find(BUILD_STEP); + if (it != options_.end() && !(it->second.empty())) { + if (build_step_options.find(it->second) == build_step_options.end()) { + GELOGE(GRAPH_PARAM_INVALID, "Build step:%s is unsupported. Please check!", it->second.c_str()); + return GRAPH_PARAM_INVALID; + } + } else { + if (build_mode == BUILD_MODE_TUNING) { + GELOGE(GRAPH_PARAM_INVALID, "Build mode tuning must specify build step. Please check!"); + return GRAPH_PARAM_INVALID; + } + } return GRAPH_SUCCESS; } + graphStatus Impl::Init(const std::map &options) { // 1. check options graphStatus ret = CheckOptions(options); @@ -186,6 +213,13 @@ graphStatus Impl::Init(const std::map &options) { GELOGE(ret, "User input options are illegal! Please check!"); return ret; } + + GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions()); + GetThreadLocalContext().SetGraphOption(options_); + std::string build_mode = (options_.find(BUILD_MODE) == options_.end() || options_[BUILD_MODE] == BUILD_MODE_NORMAL) + ? "" + : options_[BUILD_MODE]; + options_[BUILD_MODE] = build_mode; // set log level std::string log = options_.find(ge::ir_option::LOG_LEVEL) == options_.end() ? IR_OPTION_LOG_LEVEL_DEFAULT : options_[ge::ir_option::LOG_LEVEL]; @@ -212,9 +246,9 @@ graphStatus Impl::Init(const std::map &options) { } GELOGD("User input dynamic_batch_size:%s, dynamic_image_size:%s, dynamic_dims:%s.", dynamic_batch_size.c_str(), dynamic_image_size.c_str(), dynamic_dims.c_str()); - GetContext().dynamic_batch_size = dynamic_batch_size; - GetContext().dynamic_image_size = dynamic_image_size; - GetContext().dynamic_dims = dynamic_dims; + omg_context_.dynamic_batch_size = dynamic_batch_size; + omg_context_.dynamic_image_size = dynamic_image_size; + omg_context_.dynamic_dims = dynamic_dims; // check output_type std::string output_type = options_.find(ge::ir_option::OUTPUT_TYPE) == options_.end() ? "" : options_[ge::ir_option::OUTPUT_TYPE]; @@ -235,8 +269,10 @@ graphStatus Impl::Init(const std::map &options) { // print ge option map ge::PrintOptionMap(options_, "ge option"); + SetRtSocVersion(); + // 3. init generator with options_ - ret = generator_.Initialize(options_); + ret = generator_.Initialize(options_, omg_context_); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "generator Initialize failed!"); return ret; @@ -244,6 +280,20 @@ graphStatus Impl::Init(const std::map &options) { // 4.parse and init Context with input shape format and net format info return this->InitDomiOmgContext(input_shape, input_format, net_format, is_dynamic_input_); } + +void Impl::SetRtSocVersion() { + auto &global_options = GetMutableGlobalOptions(); + auto it = global_options.find(ge::SOC_VERSION); + if (it != global_options.end()) { + const char *soc_version = it->second.c_str(); + rtError_t rt_ret = rtSetSocVersion(soc_version); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Set soc version %s failed. 
ret:0x%X", soc_version, rt_ret); + } + GELOGI("Set soc version %s success.", soc_version); + } +} + graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector &inputs) { auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); @@ -259,8 +309,8 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vectorGetName(); GELOGI("Data op name: %s", data_op_name.c_str()); ge::GeShape data_shape; - auto iter = GetContext().input_dims.find(data_op_name); - if (iter != GetContext().input_dims.end()) { + auto iter = omg_context_.input_dims.find(data_op_name); + if (iter != omg_context_.input_dims.end()) { data_shape = ge::GeShape(iter->second); GELOGI("Data op get shape from Context."); } else { @@ -273,7 +323,7 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector inputs; - if (!GetContext().is_dynamic_input) { // if dynamic input , no need to creat inputs + if (!omg_context_.is_dynamic_input) { // if dynamic input , no need to creat inputs ret = CreateInputsForIRBuild(graph, inputs); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "CreateInputsForIRBuild failed!"); @@ -312,15 +362,15 @@ graphStatus Impl::BuildModel(const Graph &graph, const std::mapsecond; + omg_context_.format = iter->second; } else { GELOGE(GRAPH_PARAM_INVALID, "Input format %s not support , expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", input_format.c_str()); @@ -332,7 +382,7 @@ graphStatus Impl::InitDomiOmgContext(const string &input_shape, const string &in return GRAPH_SUCCESS; } - if (!ParseInputShape(input_shape, GetContext().input_dims, GetContext().user_input_dims, is_dynamic_input)) { + if (!ParseInputShape(input_shape, omg_context_.input_dims, omg_context_.user_input_dims, is_dynamic_input)) { GELOGE(GRAPH_PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); return GRAPH_PARAM_INVALID; } diff --git a/src/ge/model/ge_model.cc b/src/ge/model/ge_model.cc index 348f8416..70251876 100644 --- a/src/ge/model/ge_model.cc +++ b/src/ge/model/ge_model.cc @@ -43,6 +43,8 @@ std::shared_ptr GeModel::GetModelTaskDefPtr() const { return const TBEKernelStore &GeModel::GetTBEKernelStore() const { return this->tbe_kernal_store_; } +const CustAICPUKernelStore &GeModel::GetCustAICPUKernelStore() const { return this->cust_aicpu_kernal_store_; } + Buffer GeModel::GetWeight() const { return this->weights_buffer_; } std::string GeModel::GetName() const { return this->name_; } @@ -59,6 +61,10 @@ void GeModel::SetModelTaskDef(const std::shared_ptr &task) { void GeModel::SetTBEKernelStore(const TBEKernelStore &tbe_kernal_store) { this->tbe_kernal_store_ = tbe_kernal_store; } +void GeModel::SetCustAICPUKernelStore(const CustAICPUKernelStore &cust_aicpu_kernal_store) { + this->cust_aicpu_kernal_store_ = cust_aicpu_kernal_store; +} + void GeModel::SetWeight(const Buffer &weights_buffer) { this->weights_buffer_ = weights_buffer; } void GeModel::SetName(const std::string &name) { this->name_ = name; } diff --git a/src/ge/model/ge_model.h b/src/ge/model/ge_model.h index be4b65bc..288b834f 100644 --- a/src/ge/model/ge_model.h +++ b/src/ge/model/ge_model.h @@ -22,6 +22,7 @@ #include #include #include "common/tbe_kernel_store.h" +#include "common/cust_aicpu_kernel_store.h" #include "framework/common/debug/log.h" #include "framework/common/fmk_error_codes.h" #include "graph/buffer.h" @@ -40,6 +41,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder const Graph &GetGraph() const; std::shared_ptr GetModelTaskDefPtr() const; const 
TBEKernelStore &GetTBEKernelStore() const; + const CustAICPUKernelStore &GetCustAICPUKernelStore() const; Buffer GetWeight() const; std::string GetName() const; @@ -50,6 +52,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder void SetGraph(const Graph &graph); void SetModelTaskDef(const std::shared_ptr &task); void SetTBEKernelStore(const TBEKernelStore &tbe_kernal_store); + void SetCustAICPUKernelStore(const CustAICPUKernelStore &cust_aicpu_kernal_store); void SetWeight(const Buffer &weights_buffer); void SetName(const std::string &name); @@ -79,6 +82,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder Graph graph_; std::shared_ptr task_; TBEKernelStore tbe_kernal_store_; + CustAICPUKernelStore cust_aicpu_kernal_store_; Buffer weights_buffer_; std::string name_; diff --git a/src/ge/opskernel_manager/ops_kernel_manager.cc b/src/ge/opskernel_manager/ops_kernel_manager.cc index 24c5a52d..51e8f438 100644 --- a/src/ge/opskernel_manager/ops_kernel_manager.cc +++ b/src/ge/opskernel_manager/ops_kernel_manager.cc @@ -34,6 +34,8 @@ const char *const kInitialize = "Initialize"; const char *const kGetOpsKernelInfoStores = "GetOpsKernelInfoStores"; const char *const kGetGraphOptimizerObjs = "GetGraphOptimizerObjs"; const char *const kFinalize = "Finalize"; + +std::mutex ops_kernel_info_mutex; } // namespace namespace ge { @@ -198,7 +200,7 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, return SUCCESS; } -Status OpsKernelManager::CheckPluginPtr() { +Status OpsKernelManager::CheckPluginPtr() const { for (auto iter = ops_kernel_store_.begin(); iter != ops_kernel_store_.end(); ++iter) { if (iter->second == nullptr) { GELOGE(INTERNAL_ERROR, "CheckPluginPtr OpsKernelInfoStorePtr is null"); @@ -339,6 +341,8 @@ Status OpsKernelManager::Finalize() { } const vector &OpsKernelManager::GetOpsKernelInfo(const string &op_type) { + std::lock_guard lock(ops_kernel_info_mutex); + auto find = ops_kernel_info_.find(op_type); if (find != ops_kernel_info_.end()) { return find->second; @@ -353,7 +357,10 @@ const vector &OpsKernelManager::GetOpsKernelInfo(const string &op_type) } } -const map> &OpsKernelManager::GetAllOpsKernelInfo() const { return ops_kernel_info_; } +const map> &OpsKernelManager::GetAllOpsKernelInfo() const { + std::lock_guard lock(ops_kernel_info_mutex); + return ops_kernel_info_; +} OpsKernelInfoStorePtr OpsKernelManager::GetOpsKernelInfoStore(const std::string &name) const { auto find = ops_kernel_store_.find(name); diff --git a/src/ge/opskernel_manager/ops_kernel_manager.h b/src/ge/opskernel_manager/ops_kernel_manager.h index 43644d0e..a5d4d85c 100644 --- a/src/ge/opskernel_manager/ops_kernel_manager.h +++ b/src/ge/opskernel_manager/ops_kernel_manager.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "common/debug/log.h" #include "common/ge/plugin_manager.h" @@ -74,9 +75,6 @@ class OpsKernelManager { // get enablePluginFlag bool GetEnablePluginFlag() const; - // Finalize other ops kernel resource - Status FinalizeOpsKernel(); - private: OpsKernelManager(); ~OpsKernelManager(); @@ -89,7 +87,7 @@ class OpsKernelManager { Status InitOpKernelInfoStores(const map &options); - Status CheckPluginPtr(); + Status CheckPluginPtr() const; void GetExternalEnginePath(std::string &path, const std::map &options); @@ -105,6 +103,9 @@ class OpsKernelManager { Status InitGraphOptimizerPriority(); + // Finalize other ops kernel resource + Status FinalizeOpsKernel(); + PluginManager plugin_manager_; OpTilingManager 
op_tiling_manager_; // opsKernelInfoStore diff --git a/src/ge/session/inner_session.cc b/src/ge/session/inner_session.cc index a4e77b73..9f1f199f 100644 --- a/src/ge/session/inner_session.cc +++ b/src/ge/session/inner_session.cc @@ -31,28 +31,17 @@ namespace ge { namespace { Status CheckReuseMemoryOption(const std::map &options) { - const int kDecimal = 10; - auto dump_op_env = std::getenv("DUMP_OP"); - int dump_op_flag = (dump_op_env != nullptr) ? std::strtol(dump_op_env, nullptr, kDecimal) : 0; auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY); if (iter != options.end()) { if (iter->second == "0") { GELOGD("%s=0, reuse memory is open", OPTION_EXEC_DISABLE_REUSED_MEMORY); - if (dump_op_flag) { - GELOGW("Will dump incorrect op data with ge option %s=0", OPTION_EXEC_DISABLE_REUSED_MEMORY); - } } else if (iter->second == "1") { GELOGD("%s=1, reuse memory is close", OPTION_EXEC_DISABLE_REUSED_MEMORY); } else { GELOGE(PARAM_INVALID, "option %s=%s is invalid", OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str()); return FAILED; } - } else { - if (dump_op_flag) { - GELOGW("Will dump incorrect op data with default reuse memory"); - } } - return SUCCESS; } } // namespace @@ -60,7 +49,7 @@ Status CheckReuseMemoryOption(const std::map &options) { static std::mutex mutex_; // BuildGraph and RunGraph use InnerSession::InnerSession(uint64_t session_id, const std::map &options) - : init_flag_(false), session_id_(session_id), options_(options) {} + : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {} Status InnerSession::Initialize() { if (init_flag_) { diff --git a/src/ge/session/omg.cc b/src/ge/session/omg.cc index 805f8653..bcf42032 100644 --- a/src/ge/session/omg.cc +++ b/src/ge/session/omg.cc @@ -95,7 +95,28 @@ static void ParseAtcParms(const std::map &atc_params, } } -static Status CheckInputShapeNode(const ComputeGraphPtr &graph) { +static Status CheckInputShapeNode(const ComputeGraphPtr &graph, const bool is_dynamic_input) { + if (!is_dynamic_input) { + for (auto node : graph->GetDirectNode()) { + if (node->GetType() == DATA) { + auto data_op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(data_op_desc); + auto tensor_desc = data_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(tensor_desc); + for (auto dim : tensor_desc->GetShape().GetDims()) { + if (dim < 0) { + GELOGE(PARAM_INVALID, + "Input op [%s] shape %ld is negative, maybe you should set input_shape to specify its shape", + node->GetName().c_str(), dim); + const string reason = "maybe you should set input_shape to specify its shape"; + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {node->GetName(), to_string(dim), reason}); + return PARAM_INVALID; + } + } + } + } + } for (auto it : domi::GetContext().user_input_dims) { std::string node_name = it.first; ge::NodePtr node = graph->FindNode(node_name); @@ -758,7 +779,7 @@ FMK_FUNC_HOST_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map(model.model_data), &model_def); + try { + flag = google::protobuf::TextFormat::ParseFromString(reinterpret_cast(model.model_data), &model_def); + } catch (google::protobuf::FatalException &e) { + free_model_data(&model.model_data); + GELOGE(FAILED, "ParseFromString fail. 
exception message : %s", e.what()); + return FAILED; + } + if (!flag) { free_model_data(&model.model_data); GELOGE(FAILED, "ParseFromString fail."); diff --git a/src/ge/session/session_manager.cc b/src/ge/session/session_manager.cc index bca98d53..35d97c31 100644 --- a/src/ge/session/session_manager.cc +++ b/src/ge/session/session_manager.cc @@ -341,6 +341,13 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vectorRemoveGraph(graph_id); + if (ret != SUCCESS) { + GELOGE(FAILED, "Remove graph failed."); + return FAILED; + } return ret; } diff --git a/src/ge/single_op/single_op.cc b/src/ge/single_op/single_op.cc index 5fa4efcf..aeefe2be 100644 --- a/src/ge/single_op/single_op.cc +++ b/src/ge/single_op/single_op.cc @@ -91,24 +91,12 @@ Status SingleOp::ValidateArgs(const std::vector &inputs, const std:: Status SingleOp::GetArgs(const std::vector &inputs, const std::vector &outputs) { size_t arg_index = 0; - if (use_physical_addr_) { - for (auto &input : inputs) { - auto *addr = reinterpret_cast(input.data); - args_[arg_index++] = reinterpret_cast(addr); - } - - for (auto &output : outputs) { - auto *addr = reinterpret_cast(output.data); - args_[arg_index++] = reinterpret_cast(addr); - } - } else { - for (auto &input : inputs) { - args_[arg_index++] = reinterpret_cast(input.data); - } + for (auto &input : inputs) { + args_[arg_index++] = reinterpret_cast(input.data); + } - for (auto &output : outputs) { - args_[arg_index++] = reinterpret_cast(output.data); - } + for (auto &output : outputs) { + args_[arg_index++] = reinterpret_cast(output.data); } return SUCCESS; } diff --git a/src/ge/single_op/single_op.h b/src/ge/single_op/single_op.h index 71096f35..b7d23d32 100644 --- a/src/ge/single_op/single_op.h +++ b/src/ge/single_op/single_op.h @@ -53,7 +53,6 @@ class SingleOp { std::vector tasks_; std::vector> arg_table_; - bool use_physical_addr_ = false; }; class DynamicSingleOp { diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index 65f76acc..8c974259 100644 --- a/src/ge/single_op/single_op_model.cc +++ b/src/ge/single_op/single_op_model.cc @@ -85,11 +85,6 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP Status SingleOpModel::InitModelMem(StreamResource &res) { ParseOpModelParams(model_helper_, model_params_); - if (model_params_.memory_size > ALLOC_MEMORY_MAX_SIZE || model_params_.weight_size > ALLOC_MEMORY_MAX_SIZE) { - GELOGE(PARAM_INVALID, "Can not alloc memory larger than %lu. 
memory size = %lu, weight size = %lu", - ALLOC_MEMORY_MAX_SIZE, model_params_.memory_size, model_params_.weight_size); - return PARAM_INVALID; - } if (model_params_.memory_size > model_params_.zero_copy_mem_size) { const string purpose("malloc feature map memory on model execute."); @@ -203,12 +198,6 @@ Status SingleOpModel::ParseInputsAndOutputs() { } Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) { - // for lhisi - const char *use_physical_address = std::getenv("GE_USE_PHYSICAL_ADDRESS"); - if (use_physical_address != nullptr) { - single_op.use_physical_addr_ = true; - } - int arg_index = 0; for (size_t i = 0; i < input_offset_list_.size(); ++i) { auto *addr = model_params_.mem_base + input_offset_list_[i]; diff --git a/src/ge/single_op/task/op_task.cc b/src/ge/single_op/task/op_task.cc index 8280fff5..f23073bb 100644 --- a/src/ge/single_op/task/op_task.cc +++ b/src/ge/single_op/task/op_task.cc @@ -32,7 +32,7 @@ constexpr int kLaunchRetryTimes = 1000; constexpr int kSleepTime = 10; } // namespace -Status OpTask::OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream) { +Status OpTask::OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream) { if (DumpManager::GetInstance().IsDumpOpen()) { GELOGI("Dump is open in single op,start to set dump info"); std::vector input_addrs; @@ -40,11 +40,11 @@ Status OpTask::OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream) auto input_size = op_desc->GetAllInputsDesc().size(); auto output_size = op_desc->GetOutputsSize(); for (size_t i = 0; i < input_size; i++) { - uint64_t input_addr = *(reinterpret_cast(arg) + i); + uint64_t input_addr = *(reinterpret_cast(arg) + i); input_addrs.emplace_back(input_addr); } for (size_t j = 0; j < output_size; j++) { - uint64_t output_addr = *(reinterpret_cast(arg) + input_size + j); + uint64_t output_addr = *(reinterpret_cast(arg) + input_size + j); output_adds.emplace_back(output_addr); } dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc, input_addrs, output_adds, stream); diff --git a/src/ge/single_op/task/op_task.h b/src/ge/single_op/task/op_task.h index 0401a177..a571bce1 100644 --- a/src/ge/single_op/task/op_task.h +++ b/src/ge/single_op/task/op_task.h @@ -57,7 +57,7 @@ class OpTask { std::vector workspace_sizes_; protected: - Status OpenDump(void *arg, const OpDescPtr &op_desc, rtStream_t stream); + Status OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream); DumpProperties dump_properties_; DumpOp dump_op_; }; diff --git a/src/proto/dump_task.proto b/src/proto/dump_task.proto new file mode 100644 index 00000000..ecdf4792 --- /dev/null +++ b/src/proto/dump_task.proto @@ -0,0 +1,127 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto3"; +package toolkit.dumpdata; + +enum OutputDataType { + DT_UNDEFINED = 0; + DT_FLOAT = 1; + DT_FLOAT16 = 2; + DT_INT8 = 3; + DT_UINT8 = 4; + DT_INT16 = 5; + DT_UINT16 = 6; + DT_INT32 = 7; + DT_INT64 = 8; + DT_UINT32 = 9; + DT_UINT64 = 10; + DT_BOOL = 11; + DT_DOUBLE = 12; + DT_STRING = 13; + DT_DUAL_SUB_INT8 = 14; + DT_DUAL_SUB_UINT8 = 15; + DT_COMPLEX64 = 16; + DT_COMPLEX128 = 17; + DT_QINT8 = 18; + DT_QINT16 = 19; + DT_QINT32 = 20; + DT_QUINT8 = 21; + DT_QUINT16 = 22; + DT_RESOURCE = 23; + DT_STRING_REF = 24; + DT_DUAL = 25; +} + +enum OutputFormat { + FORMAT_NCHW = 0; + FORMAT_NHWC = 1; + FORMAT_ND = 2; + FORMAT_NC1HWC0 = 3; + FORMAT_FRACTAL_Z = 4; + FORMAT_NC1C0HWPAD = 5; + FORMAT_NHWC1C0 = 6; + FORMAT_FSR_NCHW = 7; + FORMAT_FRACTAL_DECONV = 8; + FORMAT_C1HWNC0 = 9; + FORMAT_FRACTAL_DECONV_TRANSPOSE = 10; + FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11; + FORMAT_NC1HWC0_C04 = 12; + FORMAT_FRACTAL_Z_C04 = 13; + FORMAT_CHWN = 14; + FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15; + FORMAT_HWCN = 16; + FORMAT_NC1KHKWHWC0 = 17; + FORMAT_BN_WEIGHT = 18; + FORMAT_FILTER_HWCK = 19; + FORMAT_HASHTABLE_LOOKUP_LOOKUPS=20; + FORMAT_HASHTABLE_LOOKUP_KEYS = 21; + FORMAT_HASHTABLE_LOOKUP_VALUE = 22; + FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23; + FORMAT_HASHTABLE_LOOKUP_HITS=24; + FORMAT_C1HWNCoC0 = 25; + FORMAT_MD = 26; + FORMAT_NDHWC = 27; + FORMAT_FRACTAL_ZZ = 28; + FORMAT_FRACTAL_NZ = 29; + FORMAT_RESERVED = 30; +} + +message OriginalOp { + string name = 1; + uint32 output_index = 2; + OutputDataType data_type = 3; + OutputFormat format = 4; +} + +message Shape { + repeated uint64 dim = 1; +} + +message OpOutput { + OutputDataType data_type = 1; + OutputFormat format = 2; + Shape shape = 3; + OriginalOp original_op = 4; // the original op corresponding to the output + bytes data = 5; + uint64 size = 6; +} + +message OpInput { + OutputDataType data_type = 1; + OutputFormat format = 2; + Shape shape = 3; + bytes data = 4; + uint64 size = 5; +} + +enum BufferType { + L1 = 0; +} + +message OpBuffer { + BufferType buffer_type = 1; + bytes data = 2; + uint64 size = 3; +} + +message DumpData{ + string version = 1; + uint64 dump_time = 2; + repeated OpOutput output = 3; + repeated OpInput input = 4; + repeated OpBuffer buffer = 5; +} diff --git a/src/proto/insert_op.proto b/src/proto/insert_op.proto index fd5bd3ec..a059e122 100644 --- a/src/proto/insert_op.proto +++ b/src/proto/insert_op.proto @@ -40,16 +40,22 @@ message AippOpParams { RAW12 = 12; RAW16 = 13; RAW24 = 14; + RGB16 = 15; + RGB20 = 16; + RGB24 = 17; + RGB8_IR = 18; + RGB16_IR = 19; + RGB24_IR = 20; } - + enum AippMode { undefined = 0; static = 1; dynamic = 2; } - + // AIPPģʽ£¬Çø·Ö¾²Ì¬AIPPºÍ¶¯Ì¬AIPP - AippMode aipp_mode = 1; + AippMode aipp_mode = 1; // related_input_rank²ÎÊýΪ±ØÌÀàÐÍΪÕûÐÍ£¬ÅäÖ÷¶Î§>=0, <=ÊäÈëDataËã×ӵĸöÊý£¬Ä¬ÈÏֵΪ0¡£ // ±êʶ¶ÔÄ£Ð͵ĵڼ¸¸öÊäÈë×öAIPP´¦Àí£¬ÀýÈçÄ£ÐÍÓÐÁ½¸öÊäÈ룬ÐèÒª¶ÔµÚ2¸öÊäÈë×öAIPP£¬ÔòÅäÖÃrelated_input_rankΪ1¡£ @@ -126,6 +132,10 @@ message AippOpParams { repeated int32 input_bias_2 = 44; // [End] ¾²Ì¬AIPP²ÎÊý + + // The n number that is used for raw/rgbir data into f16 transformation. + // The transformation equation is x/(2^n). If set to 0, no transform is performed. 
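+ // For example, with raw_rgbir_to_f16_n = 8, a 12-bit raw sample of 1024
+ // is emitted as 1024 / (2^8) = 4.0 in fp16.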
+ uint32 raw_rgbir_to_f16_n = 45; } message MultiShapeOpParams { diff --git a/third_party/fwkacllib/inc/cce/taskdown_common.hpp b/third_party/fwkacllib/inc/cce/taskdown_common.hpp index 51a8ba11..3ecea523 100644 --- a/third_party/fwkacllib/inc/cce/taskdown_common.hpp +++ b/third_party/fwkacllib/inc/cce/taskdown_common.hpp @@ -34,7 +34,8 @@ typedef enum tagccKernelType { TE_AI_CORE = 4, /* te aicore operator*/ TE_AI_CPU = 5, /* te aicpu operator */ AI_CPU = 6, /* aicpu */ - INVALID = 7, /* unknown kernel type */ + CUST_AI_CPU = 7, /* custom aicpu*/ + INVALID = 8, /* unknown kernel type */ } ccKernelType; typedef struct tagOpContext { diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h index 1d83d7bf..00c220f1 100644 --- a/third_party/fwkacllib/inc/hccl/base.h +++ b/third_party/fwkacllib/inc/hccl/base.h @@ -36,76 +36,18 @@ typedef unsigned short u16; typedef unsigned int u32; typedef unsigned long long u64; -/** - * @brief HCOM functions return value definition - */ -typedef enum tagHcclResult { - HCCL_SUCCESS = 0, /**< success */ - HCCL_E_PARA = 1, /**< parameter error */ - HCCL_E_PTR = 2, /**< empty pointer */ - HCCL_E_MEMORY = 3, /**< memory error */ - HCCL_E_INTERNAL = 4, /**< internal error */ - HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ - HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ - HCCL_E_UNAVAIL = 7, /**< resource unavailable */ - HCCL_E_SYSCALL = 8, /**< call system interface error */ - HCCL_E_TIMEOUT = 9, /**< timeout */ - HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ - HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ - HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ - HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ - HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ - HCCL_E_RUNTIME = 15, /**< call runtime api fail */ - HCCL_E_DRV = 16, /**< call driver api fail */ - HCCL_E_PROFILING = 17, /**< call profiling api fail */ - HCCL_E_CCE = 18, /**< call cce api fail */ - HCCL_E_NETWORK = 19, /**< call network api fail */ - HCCL_E_RESERVED /**< reserved */ -} hcclResult_t; - -/* handle to communicator */ -typedef void *hcclComm_t; - -/** - * @brief HCCL Reduction opperation - */ -typedef enum tagHcclRedOp { - HCCL_REP_OP_SUM = 0, /**< sum */ - HCCL_REP_OP_PROD = 1, /**< prod */ - HCCL_REP_OP_MAX = 2, /**< max */ - HCCL_REP_OP_MIN = 3, /**< min */ - HCCL_REP_OP_RESERVED /**< reserved */ -} hcclRedOp_t; - /** * @brief Horovod Reduction opperation */ -typedef enum tagHorovodRedOp { - HOROVOD_REP_OP_AVERAGE = 0, /**< average */ - HOROVOD_REP_OP_SUM = 1, /**< sum */ - HOROVOD_REP_OP_ADASUM = 2, /**< adasum */ - HOROVOD_REP_OP_MIN = 3, /**< min */ - HOROVOD_REP_OP_MAX = 4, /**< max */ - HOROVOD_REP_OP_PROD = 5, /**< proo */ - HOROVOD_REP_OP_RESERVED /**< reserved */ -} horovodRedOp_t; - -/** - * @brief HCCL data type - */ -typedef enum tagHcclDataType { - HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ - HCCL_DATA_TYPE_INT = 1, /**< int32 */ - HCCL_DATA_TYPE_HALF = 2, /**< fp16 */ - HCCL_DATA_TYPE_FLOAT = 3, /**< fp32 */ - HCCL_DATA_TYPE_INT16 = 4, /**< int16 */ - HCCL_DATA_TYPE_RESERVED /**< reserved */ -} hcclDataType_t; - -constexpr u32 HCCL_UNIQUE_ID_BYTES = 2060; // 2060: unique id length -using hcclUniqueId = struct hcclUniqueIdDef { - char internal[HCCL_UNIQUE_ID_BYTES]; -}; +typedef enum { + HOROVOD_REDUCE_AVERAGE = 0, /**< average */ + HOROVOD_REDUCE_SUM = 1, /**< sum */ + HOROVOD_REDUCE_ADASUM = 2, /**< adasum */ + HOROVOD_REDUCE_MIN = 3, /**< min */ + HOROVOD_REDUCE_MAX = 4, /**< max */ + HOROVOD_REDUCE_PROD 
= 5, /**< prod */ + HOROVOD_REDUCE_RESERVED /**< reserved */ +} HorovodReduceOp; const u32 HCCL_MAX_SEGMENT_NUM = 8; // The max number of gradient segments. diff --git a/third_party/fwkacllib/inc/hccl/hccl_types.h b/third_party/fwkacllib/inc/hccl/hccl_types.h new file mode 100755 index 00000000..03f43649 --- /dev/null +++ b/third_party/fwkacllib/inc/hccl/hccl_types.h @@ -0,0 +1,99 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hccl_types.h + * @brief HCCL data type definition + * + */ + +#ifndef HCCL_TYPES_H_ +#define HCCL_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief HCCL functions return value definition + */ +typedef enum { + HCCL_SUCCESS = 0, /**< success */ + HCCL_E_PARA = 1, /**< parameter error */ + HCCL_E_PTR = 2, /**< empty pointer */ + HCCL_E_MEMORY = 3, /**< memory error */ + HCCL_E_INTERNAL = 4, /**< internal error */ + HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ + HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ + HCCL_E_UNAVAIL = 7, /**< resource unavailable */ + HCCL_E_SYSCALL = 8, /**< call system interface error */ + HCCL_E_TIMEOUT = 9, /**< timeout */ + HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ + HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ + HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ + HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ + HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ + HCCL_E_RUNTIME = 15, /**< call runtime api fail */ + HCCL_E_DRV = 16, /**< call driver api fail */ + HCCL_E_PROFILING = 17, /**< call profiling api fail */ + HCCL_E_CCE = 18, /**< call cce api fail */ + HCCL_E_NETWORK = 19, /**< call network api fail */ + HCCL_E_RESERVED /**< reserved */ +} HcclResult; + +/** + * @brief handle to HCCL communicator + */ +typedef void *HcclComm; + +/** + * @brief HCCL Reduction operation + */ +typedef enum { + HCCL_REDUCE_SUM = 0, /**< sum */ + HCCL_REDUCE_PROD = 1, /**< prod */ + HCCL_REDUCE_MAX = 2, /**< max */ + HCCL_REDUCE_MIN = 3, /**< min */ + HCCL_REDUCE_RESERVED /**< reserved */ +} HcclReduceOp; + +/** + * @brief HCCL data type + */ +typedef enum { + HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ + HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ + HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ + HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ + HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ + HCCL_DATA_TYPE_RESERVED /**< reserved */ +} HcclDataType; + +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + +/** + * @brief HCCL root info + */ +typedef struct HcclRootInfoDef { + char internal[HCCL_ROOT_INFO_BYTES]; +} HcclRootInfo; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_TYPES_H_ diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h index 19bf4fb3..4399d3a8 100644 --- a/third_party/fwkacllib/inc/hccl/hcom.h +++ b/third_party/fwkacllib/inc/hccl/hcom.h @@ -23,6 +23,7 @@ #define HCOM_H_ #include +#include #ifdef __cplusplus 
extern "C" { @@ -33,37 +34,37 @@ extern "C" { * * @param rank_table A string identifying the rank table file path, include file name. * @param identify A string identifying the identify for the rank. - * @return hcclResult_t + * @return HcclResult * @see hcom_destroy() */ -extern hcclResult_t hcom_init(const char *rank_table, const char *identify); +extern HcclResult hcom_init(const char *rank_table, const char *identify); /** * @brief Destroy HCOM * - * @return hcclResult_t + * @return HcclResult * @see hcom_init() */ -extern hcclResult_t hcom_destroy(void); +extern HcclResult hcom_destroy(void); /** * @brief Bind the model. * * @param model A pointer identifying the model information. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult * @see hcom_unbind_model() */ -extern hcclResult_t hcom_bind_model(rtModel_t model, rtStream_t stream); +extern HcclResult hcom_bind_model(rtModel_t model, rtStream_t stream); /** * @brief Unbind the model. * * @param model An pointer identifying the model information. - * @return hcclResult_t + * @return HcclResult * @see hcom_unbind_model() */ -extern hcclResult_t hcom_unbind_model(rtModel_t model); +extern HcclResult hcom_unbind_model(rtModel_t model); /** * @brief All-gather operator. @@ -75,10 +76,10 @@ extern hcclResult_t hcom_unbind_model(rtModel_t model); * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_all_gather(const char *tag, void *inputPtr, void *outputPtr, u64 inputCount, - hcclDataType_t dataType, const char *group, rtStream_t stream); +extern HcclResult hcom_all_gather(const char *tag, void *inputPtr, void *outputPtr, u64 inputCount, + HcclDataType dataType, const char *group, rtStream_t stream); /** * @brief All-reduce operator. @@ -91,10 +92,10 @@ extern hcclResult_t hcom_all_gather(const char *tag, void *inputPtr, void *outpu * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_all_reduce(const char *tag, void *inputPtr, void *outputPtr, u64 count, - hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream); +extern HcclResult hcom_all_reduce(const char *tag, void *inputPtr, void *outputPtr, u64 count, + HcclDataType dataType, HcclReduceOp op, const char *group, rtStream_t stream); /** * @brief Broadcast operator. @@ -106,9 +107,9 @@ extern hcclResult_t hcom_all_reduce(const char *tag, void *inputPtr, void *outpu * @param root An integer(u32) identifying the the root rank in the operator. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. 
- * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_broadcast(const char *tag, void *ptr, u64 count, hcclDataType_t dataType, u32 root, +extern HcclResult hcom_broadcast(const char *tag, void *ptr, u64 count, HcclDataType dataType, u32 root, const char *group, rtStream_t stream); /** @@ -122,46 +123,46 @@ extern hcclResult_t hcom_broadcast(const char *tag, void *ptr, u64 count, hcclDa * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, u64 count, - hcclDataType_t dataType, hcclRedOp_t op, const char *group, rtStream_t stream); +extern HcclResult hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, u64 count, + HcclDataType dataType, HcclReduceOp op, const char *group, rtStream_t stream); /** * @brief Get the rank number in the group. * * @param group A string identifying the group name. * @param rankSize A pointer identifying the rank number. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_rank_size(const char *group, u32 *rankSize); +HcclResult hcom_get_rank_size(const char *group, u32 *rankSize); /** * @brief Get the rank number of this rank's server within the group. * * @param group A string identifying the group name. * @param localRankSize A pointer identifying the rank number. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_local_rank_size(const char *group, u32 *localRankSize); +HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize); /** * @brief Get the rank id of this rank. * * @param group A string identifying the group name. * @param rankId A pointer identifying the rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_rank_id(const char *group, u32 *rankId); +HcclResult hcom_get_rank_id(const char *group, u32 *rankId); /** * @brief Get the local rank id of this rank's server within the group. * * @param group A string identifying the group name. * @param localRankId A pointer identifying the local rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_local_rank_id(const char *group, u32 *localRankId); +HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId); /** * @brief Get the world rank id according to the group rank id. @@ -169,9 +170,9 @@ hcclResult_t hcom_get_local_rank_id(const char *group, u32 *localRankId); * @param group A string identifying the group name. * @param groupRank An integer(u32) identifying the group rank id. * @param worldRank A pointer identifying the world rank id. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank); +HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank); /** * @brief Get the group rank id according to the world rank id. @@ -179,9 +180,9 @@ hcclResult_t hcom_get_world_rank_from_group_rank(const char *group, u32 groupRan * @param worldRank An integer(u32) identifying the world rank id. * @param group A string identifying the group name. * @param groupRank A pointer identifying the group rank id. 
- * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank); +HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank); /** * @brief Create group. @@ -189,17 +190,17 @@ hcclResult_t hcom_get_group_rank_from_world_rank(u32 worldRank, const char *grou * @param group A string identifying the group name. * @param rankNum An integer(u32) identifying the number of ranks in the group. * @param rankIds A list identifying the ranks in the group. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); +HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); /** * @brief Destroy group. * * @param group A string identifying the group name. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_destroy_group(const char *group); +HcclResult hcom_destroy_group(const char *group); /** * @brief Send operator. @@ -213,9 +214,9 @@ hcclResult_t hcom_destroy_group(const char *group); * The message will be sent by the receive operator with the same "sr_tag". * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_send(const char *tag, void *inputPtr, u64 count, hcclDataType_t dataType, +HcclResult hcom_send(const char *tag, void *inputPtr, u64 count, HcclDataType dataType, u32 destRank, u32 srTag, const char *group, rtStream_t stream); /** @@ -230,9 +231,9 @@ hcclResult_t hcom_send(const char *tag, void *inputPtr, u64 count, hcclDataType_ * The message will be sent by the send operator with the same "sr_tag". * @param group A string identifying the group name of ranks participating in the operator. * @param stream A pointer identifying the stream information. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_receive(const char *tag, void *outputPtr, u64 count, hcclDataType_t dataType, +HcclResult hcom_receive(const char *tag, void *outputPtr, u64 count, HcclDataType dataType, u32 srcRank, u32 srTag, const char *group, rtStream_t stream); /** @@ -243,9 +244,9 @@ hcclResult_t hcom_receive(const char *tag, void *outputPtr, u64 count, hcclDataT * @param maxSegmentNum An integer(u32) identifying the max segments of gradients. * @param segmentNum A pointer identifying the segments number of gradients. * @param segmentIdx A list identifying the index of end gradient in each segment. - * @return hcclResult_t + * @return HcclResult */ -hcclResult_t hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 maxSegmentNum, +HcclResult hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 maxSegmentNum, u32 *segmentNum, u32 *segmentIdx, GradSplitForceMode force = FORCE_NONE, OriginalGraphShapeType shapeType = KNOWN_SHAPE); @@ -255,9 +256,9 @@ hcclResult_t hcom_get_split_strategy(const char *group, const struct model_featu * @param group A string identifying the group name. * @param segmentNum An integer(u32) identifying the segments number of gradients. * @param IdxList A list identifying the index of end gradient in each segment.
- * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList); +extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList); /** * @brief Set the gradient split strategy within the group, according to gradient data size. @@ -265,9 +266,9 @@ extern hcclResult_t hcom_set_split_strategy_by_index(const char *group, u32 segm * @param group A string identifying the group name. * @param segmentNum An integer(u32) identifying the segments number of gradients. * @param sizeList A list identifying the percent of each segment. - * @return hcclResult_t + * @return HcclResult */ -extern hcclResult_t hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); +extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); #ifdef __cplusplus } diff --git a/third_party/fwkacllib/inc/ops/aipp.h b/third_party/fwkacllib/inc/ops/aipp.h index 85666223..d11fdc95 100644 --- a/third_party/fwkacllib/inc/ops/aipp.h +++ b/third_party/fwkacllib/inc/ops/aipp.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file aipp.h + * \brief + */ #ifndef GE_OP_AIPP_H #define GE_OP_AIPP_H @@ -41,7 +45,6 @@ REG_OP(Aipp) .OUTPUT(features, TensorType({DT_FLOAT16, DT_UINT8})) .ATTR(aipp_config_path, String, "./aipp.cfg") .OP_END_FACTORY_REG(Aipp) -} // namespace ge /** *@brief This op is for dynamic aipp. If you set aipp-mode to dynamic \n in aipp config file, framework will auto add one input node to graph at last. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AippData. */ -namespace ge { REG_OP(AippData) .INPUT(data, TensorType::ALL()) .OUTPUT(out, TensorType::ALL()) .ATTR(index, Int, 0) .OP_END_FACTORY_REG(AippData) -} +} // namespace ge #endif // GE_OP_AIPP_H diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h index c30bf32b..84ff3d08 100644 --- a/third_party/fwkacllib/inc/ops/all_ops.h +++ b/third_party/fwkacllib/inc/ops/all_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file all_ops.h + * \brief + */ #ifndef BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ #define BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index 7c6f9b2c..ea82e0fa 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file array_ops.h + * \brief + */ #ifndef GE_OP_ARRAY_OPS_H_ #define GE_OP_ARRAY_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/audio_ops.h b/third_party/fwkacllib/inc/ops/audio_ops.h index 6db181f9..feecd7ae 100644 --- a/third_party/fwkacllib/inc/ops/audio_ops.h +++ b/third_party/fwkacllib/inc/ops/audio_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file audio_ops.h + * \brief + */ #ifndef GE_OP_AUDIO_OPS_H_ #define GE_OP_AUDIO_OPS_H_ @@ -43,11 +47,12 @@ per time slice. *@attention Constraints: \n *Mfcc runs on the Ascend AI CPU, which delivers poor performance. \n - *@par Third-party framework compatibility *Compatible with the TensorFlow operator Mfcc. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/ REG_OP(Mfcc) .INPUT(spectrogram, TensorType({DT_FLOAT})) .INPUT(sample_rate, TensorType({DT_INT32})) @@ -79,6 +84,9 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AudioSpectrogram. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(AudioSpectrogram) @@ -110,6 +118,9 @@ Length of audio requested. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DecodeWav. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(DecodeWav) @@ -136,6 +147,9 @@ REG_OP(DecodeWav) *@par Third-party framework compatibility *Compatible with tensorflow Operator EncodeWav. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(EncodeWav) diff --git a/third_party/fwkacllib/inc/ops/batch_ops.h b/third_party/fwkacllib/inc/ops/batch_ops.h index 47c5b06b..dd2efade 100644 --- a/third_party/fwkacllib/inc/ops/batch_ops.h +++ b/third_party/fwkacllib/inc/ops/batch_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file batch_ops.h + * \brief + */ #ifndef GE_OP_BATCH_OPS_H_ #define GE_OP_BATCH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/bitwise_ops.h b/third_party/fwkacllib/inc/ops/bitwise_ops.h index ccbeb04c..0a6cbe9b 100644 --- a/third_party/fwkacllib/inc/ops/bitwise_ops.h +++ b/third_party/fwkacllib/inc/ops/bitwise_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file bitwise_ops.h + * \brief + */ #ifndef GE_OP_BITWISE_OPS_H_ #define GE_OP_BITWISE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h index 37345833..cded3acd 100644 --- a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h +++ b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file boosted_trees_ops.h + * \brief + */ #ifndef GE_OP_BOOSTED_TREES_OPS_H_ #define GE_OP_BOOSTED_TREES_OPS_H_ @@ -44,8 +48,10 @@ a single feature. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BoostedTreesBucketize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BoostedTreesBucketize) .DYNAMIC_INPUT(float_values, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(bucket_boundaries, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h index 50178a59..c0109fca 100644 --- a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h +++ b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file candidate_sampling_ops.h + * \brief + */ #ifndef GE_OP_CANDIDATE_SAMPLING_OPS_H_ #define GE_OP_CANDIDATE_SAMPLING_OPS_H_ @@ -60,8 +64,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ThreadUnsafeUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -114,8 +120,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator UniformCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
+*/ REG_OP(UniformCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -180,8 +188,10 @@ which delivers poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator FixedUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(FixedUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -239,8 +249,10 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(LearnedUnigramCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -291,8 +303,10 @@ poor performance. *@par Third-party framework compatibility *Compatible with the TensorFlow operator LogUniformCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(LogUniformCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -339,8 +353,10 @@ to occur in a batch of sampled candidates. If "unique" is true, then this is a p *@par Third-party framework compatibility *Compatible with the TensorFlow operator AllCandidateSampler. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AllCandidateSampler) .INPUT(true_classes, TensorType({ DT_INT64 })) .OUTPUT(sampled_candidates, TensorType({ DT_INT64 })) @@ -379,8 +395,10 @@ each element is -FLOAT_MAX. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ComputeAccidentalHits. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ComputeAccidentalHits) .INPUT(true_classes, TensorType({ DT_INT64 })) .INPUT(sampled_candidates, TensorType({ DT_INT64 })) diff --git a/third_party/fwkacllib/inc/ops/condtake_ops.h b/third_party/fwkacllib/inc/ops/condtake_ops.h index 37d3b92a..72bf46a0 100644 --- a/third_party/fwkacllib/inc/ops/condtake_ops.h +++ b/third_party/fwkacllib/inc/ops/condtake_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file condtake_ops.h + * \brief + */ #ifndef GE_OP_CONDTAKE_OPS_H_ #define GE_OP_CONDTAKE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/control_flow_ops.h b/third_party/fwkacllib/inc/ops/control_flow_ops.h index fa68d49a..75992103 100644 --- a/third_party/fwkacllib/inc/ops/control_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/control_flow_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file control_flow_ops.h + * \brief + */ #ifndef GE_CONTROL_FLOW_OPS_H_ #define GE_CONTROL_FLOW_OPS_H_ @@ -377,6 +381,27 @@ REG_OP(RefExit) */ REG_OP(ControlTrigger) .OP_END_FACTORY_REG(ControlTrigger) + +/** +*@brief Returns index of shape in the map. + +*@par Inputs: +* Three inputs, including: +*@li x: One-dimensional tensor of type int32, specifying queried shape, max size is 8. +*@li data_seq: One-dimensional tensor of type int32, specifying the mapped table to be queried. +*@li level_index: One-dimensional tensor of type int32, specifying secondary index. + +*@par Outputs: +*@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map.
+*@par Third-party framework compatibility +* It is a custom operator. It has no corresponding operator in Caffe. +*/ +REG_OP(MapIndex) + .INPUT(x, TensorType({DT_INT32})) + .INPUT(data_seq, TensorType({DT_INT32})) + .OPTIONAL_INPUT(level_index, TensorType({DT_INT32})) + .OUTPUT(y, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(MapIndex) } // namespace ge #endif // GE_CONTROL_FLOW_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/ctc_ops.h b/third_party/fwkacllib/inc/ops/ctc_ops.h index 74b797f3..eaf6f9e9 100644 --- a/third_party/fwkacllib/inc/ops/ctc_ops.h +++ b/third_party/fwkacllib/inc/ops/ctc_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ctc_ops.h + * \brief + */ #ifndef GE_OP_CTC_OPS_H #define GE_OP_CTC_OPS_H diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index c766167a..d407c4cd 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file data_flow_ops.h + * \brief + */ #ifndef GE_OP_DATA_FLOW_OPS_H_ #define GE_OP_DATA_FLOW_OPS_H_ @@ -486,8 +490,10 @@ DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DynamicPartition. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DynamicPartition) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ @@ -521,8 +527,10 @@ DT_QUINT8, DT_QINT8, DT_STRING, DT_COMPLEX64, DT_COMPLEX128. *@par Third-party framework compatibility *Compatible with the TensorFlow operator DynamicStitch. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DynamicStitch) .DYNAMIC_INPUT(indices, TensorType({DT_INT32})) .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ @@ -1603,8 +1611,10 @@ the given name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator Barrier. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Barrier) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(component_types, ListType) @@ -1635,8 +1645,10 @@ DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierInsertMany. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierInsertMany) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(keys, TensorType({DT_STRING})) @@ -1683,8 +1695,10 @@ DT_RESOURCE, DT_STRING. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierTakeMany. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierTakeMany) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_elements, TensorType(DT_INT32)) @@ -1718,8 +1732,10 @@ even if no new key is introduced. *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierClose. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
+*/ REG_OP(BarrierClose) .INPUT(handle, TensorType({DT_STRING_REF})) .ATTR(cancel_pending_enqueues, Bool, false) @@ -1740,8 +1756,10 @@ REG_OP(BarrierClose) *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierReadySize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierReadySize) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(size, TensorType(DT_INT32)) @@ -1762,8 +1780,10 @@ REG_OP(BarrierReadySize) *@par Third-party framework compatibility *Compatible with the TensorFlow operator BarrierIncompleteSize. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(BarrierIncompleteSize) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(size, TensorType(DT_INT32)) @@ -1824,8 +1844,10 @@ name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ConditionalAccumulator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ConditionalAccumulator) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(dtype, Type) @@ -1858,8 +1880,10 @@ which delivers poor performance.\n *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorApplyGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorApplyGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(local_step, TensorType({DT_INT64})) @@ -1884,8 +1908,10 @@ which delivers poor performance.\n *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorNumAccumulated. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorNumAccumulated) .INPUT(handle, TensorType({DT_STRING_REF})) .OUTPUT(y, TensorType({DT_INT32})) @@ -1904,8 +1930,10 @@ REG_OP(AccumulatorNumAccumulated) *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorSetGlobalStep. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorSetGlobalStep) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(new_global_step, TensorType({DT_INT64})) @@ -1935,8 +1963,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE. *@par Third-party framework compatibility *Compatible with the TensorFlow operator AccumulatorTakeGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(AccumulatorTakeGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_required, TensorType({DT_INT32})) @@ -1962,8 +1992,10 @@ default is "MEAN". *@par Third-party framework compatibility *Compatible with tensorflow SparseConditionalAccumulator operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseConditionalAccumulator) .OUTPUT(handle, TensorType({DT_STRING_REF})) .REQUIRED_ATTR(shape, ListInt) @@ -1996,8 +2028,10 @@ the type of the accumulator. *@par Third-party framework compatibility *Compatible with tensorflow SparseAccumulatorApplyGradient operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseAccumulatorApplyGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(local_step, TensorType({DT_INT64})) @@ -2030,8 +2064,10 @@ type of the accumulator. 
*@par Third-party framework compatibility *Compatible with tensorflow SparseAccumulatorTakeGradient operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(SparseAccumulatorTakeGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_required, TensorType({DT_INT32})) @@ -2062,8 +2098,10 @@ name across multiple sessions. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceConditionalAccumulator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceConditionalAccumulator) .OUTPUT(handle, TensorType({DT_RESOURCE})) .REQUIRED_ATTR(dtype, Type) @@ -2089,8 +2127,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorApplyGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorApplyGradient) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(local_step, TensorType({DT_INT64})) @@ -2111,8 +2151,10 @@ REG_OP(ResourceAccumulatorApplyGradient) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorNumAccumulated. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorNumAccumulated) .INPUT(handle, TensorType({DT_RESOURCE})) .OUTPUT(num_accumulated, TensorType({DT_INT32})) @@ -2130,8 +2172,10 @@ REG_OP(ResourceAccumulatorNumAccumulated) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorSetGlobalStep. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorSetGlobalStep) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(new_global_step, TensorType({DT_INT64})) @@ -2158,8 +2202,10 @@ DT_FLOAT16, DT_FLOAT, DT_DOUBLE. *@par Third-party framework compatibility *Compatible with the TensorFlow operator ResourceAccumulatorTakeGradient. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(ResourceAccumulatorTakeGradient) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(num_required, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index 741a9071..cd42b707 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file elewise_calculation_ops.h + * \brief + */ #ifndef GE_OP_ELEWISE_CALCULATION_OPS_H #define GE_OP_ELEWISE_CALCULATION_OPS_H #include "graph/operator_reg.h" @@ -2910,14 +2914,14 @@ REG_OP(Bias) .OP_END_FACTORY_REG(Bias) /** -*@brief Function multiply gradients calculation. \n +*@brief Function multiply gradients calculation. output0 is the result of input0 dot-multiplied by input1. output1 is the result of input0 dot-multiplied by input1, then reduce-summed. *@par Inputs: *@li input0: A Tensor of input of mul, and dtype supports float16, float32. *@li input1: A Tensor of input of mul and mul_1, and dtype supports float16, float32. -*@li input2: A Tensor of input of mul_1, and dtype supports float16, float32'. +*@li input2: A Tensor of input of mul_1, and dtype supports float16, float32. *@par Attributes: *@li axes: The dimensions to reduce. Default:(), reduce all dimensions.
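Before the next hunk, an editorial aside: the rewritten ConfusionMulGrad brief above is easy to misread, so here is a scalar reference of the two outputs it describes. This is a plain C++ sketch over flat buffers, assuming the default axes=() (reduce over all elements); it is not part of the patch and the function name is hypothetical:

#include <cstddef>

// Reference semantics for the brief above:
//   output0[i] = input0[i] * input1[i]                (the elementwise "mul")
//   output1    = sum over i of input0[i] * input1[i]  (the "mul" result reduce-summed)
void ConfusionMulGradRef(const float *input0, const float *input1,
                         float *output0, float *output1, std::size_t n) {
  float acc = 0.0f;
  for (std::size_t i = 0; i < n; ++i) {
    output0[i] = input0[i] * input1[i];
    acc += output0[i];
  }
  *output1 = acc;
}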
\n @@ -2940,12 +2944,12 @@ REG_OP(ConfusionMulGrad) .OP_END_FACTORY_REG(ConfusionMulGrad) /** -*@brief Function fused multiply l2 loss calculation. \n +*@brief Function fused multiply l2 loss calculation. *@par Inputs: -*@li x1: A Tensor of type float16, float32. -*@li x2: A Tensor of type float16, float32. -*@li x3: A Tensor of type float16, float32. +*@li x1: A Tensor of number type. +*@li x2: A Tensor of number type. +*@li x3: A Tensor of number type. *@par Outputs: *@li y1: A Tensor of shape and dtype of first output, which should have \n @@ -3092,7 +3096,6 @@ REG_OP(Fills) *@brief Add tensor with scale. *@par Inputs: -*Five inputs, including: * @li x1: A Tensor dtype of int32, float16, float32. * @li x2: A Tensor dtype of int32, float16, float32. @@ -3178,6 +3181,27 @@ REG_OP(TensorMove) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_BOOL})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_BOOL})) .OP_END_FACTORY_REG(TensorMove) + +/** +*@brief Copy data from x to output_x. + +*@par Inputs: +*One input, including: +* @li x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32, bool. + +*@par Outputs: +*output_x: A Tensor. Has the same type as "x". + +*@par Third-party framework compatibility +*/ +REG_OP(TensorRedirect) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8, + DT_INT64, DT_INT16, DT_UINT16, DT_DOUBLE, + DT_COMPLEX64})) + .OUTPUT(output_x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8, + DT_INT64, DT_INT16, DT_UINT16, DT_DOUBLE, + DT_COMPLEX64})) + .OP_END_FACTORY_REG(TensorRedirect) } // namespace ge diff --git a/third_party/fwkacllib/inc/ops/functional_ops.h b/third_party/fwkacllib/inc/ops/functional_ops.h index f4a88661..33dce25d 100644 --- a/third_party/fwkacllib/inc/ops/functional_ops.h +++ b/third_party/fwkacllib/inc/ops/functional_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file functional_ops.h + * \brief + */ #ifndef GE_FUNCTIONAL_OPS_H_ #define GE_FUNCTIONAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/get_data_ops.h b/third_party/fwkacllib/inc/ops/get_data_ops.h index 0a9b174b..33a64903 100644 --- a/third_party/fwkacllib/inc/ops/get_data_ops.h +++ b/third_party/fwkacllib/inc/ops/get_data_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file get_data_ops.h + * \brief + */ #ifndef GE_OP_GET_DATA_OPS_H_ #define GE_OP_GET_DATA_OPS_H_ @@ -21,12 +25,31 @@ namespace ge { +/** +*@brief Binding dataset and GetNext +*@par Attributes: None +*@par Inputs: Dataset and GetNext operator +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(MakeIterator) .INPUT(x, TensorType::ALL()) .INPUT(x1, TensorType::ALL()) .ATTR(_kernel, String, "dp") .OP_END_FACTORY_REG(MakeIterator) +/** +*@brief Dataset iterator +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*container: Iterator container name +*shared_name: Iterator id +*@par Inputs: None +*@par Outputs: Dataset +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/ REG_OP(IteratorV2) .OUTPUT(y, TensorType::ALL()) .ATTR(output_types, ListInt, {}) @@ -35,6 +58,17 @@ REG_OP(IteratorV2) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(IteratorV2) +/** +*@brief Dataset GetNext iterator +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*output_num: Num of output +*@par Inputs: Queue data +*@par Outputs: Input of compute graph +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(IteratorGetNext) .INPUT(x, TensorType::ALL()) .DYNAMIC_OUTPUT(y, TensorType::ALL()) @@ -44,6 +78,17 @@ REG_OP(IteratorGetNext) .ATTR(_kernel, String, "dp") .OP_END_FACTORY_REG(IteratorGetNext) +/** +*@brief Device queue data area. +*@par Attributes: +*output_types: Data type of output +*output_shapes: Shapes of output +*channel_name: Channel ID corresponding to TDT +*@par Inputs: None +*@par Outputs: Dataset GetNext iterator +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(DeviceQueueDataset) .OUTPUT(y, TensorType::ALL()) .ATTR(output_types, ListInt, {}) diff --git a/third_party/fwkacllib/inc/ops/hcom_ops.h b/third_party/fwkacllib/inc/ops/hcom_ops.h index bdacebdf..231729ce 100644 --- a/third_party/fwkacllib/inc/ops/hcom_ops.h +++ b/third_party/fwkacllib/inc/ops/hcom_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file hcom_ops.h + * \brief Huawei collective communication library ops. + */ #ifndef GE_OP_HCOM_OPS_H_ #define GE_OP_HCOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/hvd_ops.h b/third_party/fwkacllib/inc/ops/hvd_ops.h index 09748b8e..89282ca5 100644 --- a/third_party/fwkacllib/inc/ops/hvd_ops.h +++ b/third_party/fwkacllib/inc/ops/hvd_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file hvd_ops.h + * \brief Horovod collective communication library ops. + */ #ifndef GE_OP_HVD_OPS_H_ #define GE_OP_HVD_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 1ea62fa9..9412112c 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file image_ops.h + * \brief + */ #ifndef GE_OP_MAGE_OPS_H_ #define GE_OP_MAGE_OPS_H_ @@ -144,36 +148,33 @@ REG_OP(CropAndResize) .OP_END_FACTORY_REG(CropAndResize) /** -*@brief Extracts crops from the input image tensor and resizes them. Extracts \n -crops from the input image tensor and resizes them using bilinear sampling or \n -nearest neighbor sampling to a common output size specified by crop_size. +*@brief Extracts crops from the input image tensor and resizes them. +* Extracts crops from the input image tensor and resizes them using bilinear sampling or +* nearest neighbor sampling to a common output size specified by crop_size. *@par Inputs: -*Input images must be a 5HD tensor. Inputs include: \n -*@li images:A Tensor. Must be one of the following types:float. A 5HD tensor of shape \n -[batch, C1, image_height, image_width, C0]. +*Input images must be a 5HD tensor. Inputs include: +*@li x:A Tensor. Must be one of the following types:float16, float. A 5HD tensor of shape +* [batch, C1, image_height, image_width, C0]. *@li boxes: A Tensor of type float. A 2-D tensor of shape [num_boxes, 4]. -*@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with \n -int32 values in [0, batch - 1). +*@li box_index: A Tensor of type int32.
A 1-D tensor of shape [num_boxes] with int32 values in [0, batch). *@par Attributes: *@li crop_size: list int. [crop_height, crop_width]. All cropped image patches are resized to this size. -*@li extrapolation_value: An optional float. Defaults to 0. Value used for \n -extrapolation, when applicable. -*@li method: An optional string from: '"bilinear"'. Defaults to \n -"bilinear". +*@li extrapolation_value: An optional float. Defaults to 0. Value used for extrapolation, when applicable. +*@li method: An optional string from: '"bilinear"'. Defaults to "bilinear". *@par Outputs: *y:A Tensor of type float. -*@attention Constraints: \n +*@attention Constraints: *Input images must be a 5HD tensor. *@par Third-party framework compatibility *Compatible with tensorflow CropAndResize operator. */ REG_OP(CropAndResizeD) - .INPUT(x, TensorType({DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(boxes, TensorType({DT_FLOAT})) .INPUT(box_index, TensorType({DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/internal_ops.h b/third_party/fwkacllib/inc/ops/internal_ops.h index 0f9fd12f..014b7a1b 100644 --- a/third_party/fwkacllib/inc/ops/internal_ops.h +++ b/third_party/fwkacllib/inc/ops/internal_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file internal_ops.h + * \brief + */ #ifndef GE_OP_INTERNAL_OPS_H_ #define GE_OP_INTERNAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/linalg_ops.h b/third_party/fwkacllib/inc/ops/linalg_ops.h index 916c3267..145e021e 100644 --- a/third_party/fwkacllib/inc/ops/linalg_ops.h +++ b/third_party/fwkacllib/inc/ops/linalg_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file linalg_ops.h + * \brief + */ #ifndef GE_OP_LINALG_OPS_H_ #define GE_OP_LINALG_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/logging_ops.h b/third_party/fwkacllib/inc/ops/logging_ops.h index 897fc699..7ca04188 100644 --- a/third_party/fwkacllib/inc/ops/logging_ops.h +++ b/third_party/fwkacllib/inc/ops/logging_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file logging_ops.h + * \brief + */ #ifndef GE_OP_LOGGING_OPS_H #define GE_OP_LOGGING_OPS_H @@ -35,8 +39,10 @@ the graph. *@par Third-party framework compatibility *Compatible with tensorflow Timestamp operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Timestamp) .OUTPUT(y, TensorType({DT_DOUBLE})) .OP_END_FACTORY_REG(Timestamp) @@ -55,8 +61,10 @@ Inputs include: \n *@par Third-party framework compatibility *Compatible with tensorflow Assert operator. -*/ +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Assert) .INPUT(input_condition, TensorType{DT_BOOL}) .DYNAMIC_INPUT(input_data, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, @@ -72,6 +80,9 @@ REG_OP(Assert) *x: The tensor to print, it is a dynamic_input. *Compatible with aicpu Print operator. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(Print) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, @@ -91,6 +102,9 @@ to print to. *@par Third-party framework compatibility *Compatible with tensorflow PrintV2 operator. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
*/ REG_OP(PrintV2) .INPUT(x, TensorType({DT_STRING})) diff --git a/third_party/fwkacllib/inc/ops/lookup_ops.h b/third_party/fwkacllib/inc/ops/lookup_ops.h index 4dd87a8e..bd34ab64 100644 --- a/third_party/fwkacllib/inc/ops/lookup_ops.h +++ b/third_party/fwkacllib/inc/ops/lookup_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file lookup_ops.h + * \brief + */ #ifndef GE_OP_LOOKUP_OPS_H_ #define GE_OP_LOOKUP_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index 6d1e2cd2..9ee4f6d4 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file math_ops.h + * \brief + */ #ifndef GE_OP_MATH_OPS_H_ #define GE_OP_MATH_OPS_H_ @@ -630,6 +634,44 @@ REG_OP(NLLLossGrad) .OUTPUT(x_grad, TensorType({DT_FLOAT})) .ATTR(reduction, String, "mean") .OP_END_FACTORY_REG(NLLLossGrad) + +/** +*@brief The IFMR op, which searches for the optimal quantization scale and offset of a feature map. + +*@par Inputs: +*@li data:A Tensor of feature map. +*@li data_min:A Tensor of min value of feature map. +*@li data_max:A Tensor of max value of feature map. +*@li cumsum:A Tensor of cumsum bin of data. + +*@par Attributes: +*min_percentile: min init percentile. +*max_percentile: max init percentile. +*search_range: search range. +*search_step: step size of searching. +*with_offset: whether to use the offset. + +*@par Outputs: +*scale: optimal scale. +*offset: optimal offset. + +*@par Third-party framework compatibility +*Compatible with MindSpore. +*/ + +REG_OP(IFMR) + .INPUT(data, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(data_min, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(data_max, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(cumsum, TensorType({DT_INT32})) + .OUTPUT(scale, TensorType({DT_FLOAT})) + .OUTPUT(offset, TensorType({DT_FLOAT})) + .REQUIRED_ATTR(min_percentile, Float) + .REQUIRED_ATTR(max_percentile, Float) + .REQUIRED_ATTR(search_range, ListFloat) + .REQUIRED_ATTR(search_step, Float) + .REQUIRED_ATTR(with_offset, Bool) + .OP_END_FACTORY_REG(IFMR) } // namespace ge #endif // GE_OP_MATH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 7cb24ee7..de94b58e 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file matrix_calculation_ops.h + * \brief + */ #ifndef GE_OP_MATRIX_CALCULATION_OPS_H #define GE_OP_MATRIX_CALCULATION_OPS_H diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index 296dd63c..a120b31d 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_batch_norm_ops.h + * \brief + */ #ifndef GE_OP_NN_BATCH_NORM_OPS_H #define GE_OP_NN_BATCH_NORM_OPS_H @@ -340,6 +344,8 @@ REG_OP(BnHost) *@li mode: An optional attr, not used *@par Outputs:\n *@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x" +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use BNInference instead.
*/ REG_OP(BNInferenceD) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index b2cf56ad..5b84b1fb 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_calculation_ops.h + * \brief + */ #ifndef GE_OP_NN_CALCULATION_OPS_H #define GE_OP_NN_CALCULATION_OPS_H @@ -124,6 +128,10 @@ REG_OP(DepthwiseConv2DBackpropFilter) * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator DepthwiseConv2DBackpropFilter. * @li Compatible with the Caffe operator DepthwiseConv2DBackpropFilter. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use DepthwiseConv2DBackpropFilter +* instead. */ REG_OP(DepthwiseConv2DBackpropFilterD) .INPUT(input, TensorType({float16})) @@ -239,6 +247,10 @@ REG_OP(DepthwiseConv2DBackpropInput) * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator DepthwiseConv2DBackpropInput. * @li Compatible with the Caffe operator DepthwiseConv2DBackpropInput. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use DepthwiseConv2DBackpropInput +* instead. */ REG_OP(DepthwiseConv2DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16})) @@ -340,20 +352,30 @@ REG_OP(BiasAddGrad) *@brief Computes the gradients of convolution with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32. An integer vector representing the shape of input, - * where input is a 4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32, float64. - * 4-D with shape [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li input_size: A Tensor of type int32. An integer vector representing the + * shape of input, where input is a 4-D tensor [batch, height, width, channels] + * or [batch, channels, height, width]. + * @li filter: A Tensor. Must be one of the following types: float16, float32, + * float64. 4-D with shape + * [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 4-D with shape [batch, out_height, out_width, out_channels] + * or [batch, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Attributes: * Five attributes: - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". 
Specify the data format of the input and output data. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads + * on feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,and has same format as input_size *@par Third-party framework compatibility @@ -376,23 +398,35 @@ REG_OP(Conv2DBackpropInput) *@par Inputs: * Two inputs: * @li filter: A Tensor. Types is float16. - * 4-D with shape [filter_height, filter_width, in_channels, out_channels] or [out_channels, filter_height, filter_width, in_channels] + * 4-D with shape [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] * or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 4-D with shape [batch, out_height, out_width, out_channels] + * or [batch, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Attributes: * Six attributes: - * @li input_size A Tensor of type int32. An integer vector representing the shape of input, - * where input is a 4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li input_size A Tensor of type int32. An integer vector representing the + * shape of input, where input is a 4-D tensor [batch, height, width, channels] + * or [batch, channels, height, width]. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: - * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. + * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, + * channels] or [batch, channels, height, width]. 
*@par Third-party framework compatibility * Compatible with Tensorflow's conv2d_backprop_input +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DBackpropInput instead. */ REG_OP(Conv2DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) @@ -431,7 +465,8 @@ REG_OP(Conv2DBackpropInputD) output channels. Defaults to "1". * @li data_format: An optional string from: "NCHW". Defaults to "NCHW". \n Specify the data format of the input and output data. - * @li offset_x: An optional integer for quantized deconvolution. Defaults to "0". + * @li offset_x: An optional integer for quantized deconvolution. + * Defaults to "0". *@par Outputs: * y: A Tensor. 4D tensor with shape [batch, channels, height, width]. * When type of x is float16, the type of y must be float16. @@ -454,20 +489,30 @@ REG_OP(Deconvolution) *@brief Computes the gradients of convolution with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, float64. - * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, in_channels, in_height, in_width]. - * @li filter_size: A Tensor of type int32. An integer vector representing the tensor shape of filter, - * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li x: A Tensor. Must be one of the following types: float16, float32, + * float64.4-D with shape [batch, in_height, in_width, in_channels] or + * [batch, in_channels, in_height, in_width]. + * @li filter_size: A Tensor of type int32. An integer vector representing the + * tensor shape of filter, where filter is a 4-D tensor [filter_height, + * filter_width, in_channels, out_channels] or [out_channels, filter_height, + * filter_width, in_channels] or [out_channels, in_channel, filter_height, + * filter_width]. + * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape + * [batch, out_height, out_width, out_channels] or [batch, out_channels, + * out_height, out_width]. Gradients with respect to the output of the + * convolution. *@par Attributes: * Five attributes: - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map. - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map. + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". 
Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x *@par Third-party framework compatibility @@ -490,23 +535,35 @@ REG_OP(Conv2DBackpropFilter) *@par Inputs: * Two inputs: * @li x: A Tensor. Type is float16. - * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, in_channels, in_height, in_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape [batch, out_height, out_width, out_channels] - * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, + * in_channels, in_height, in_width]. + * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape + * [batch, out_height, out_width, out_channels] or [batch, out_channels, + * out_height, out_width]. Gradients with respect to the output of the + * convolution. *@par Attributes: * Six attributes: - * @li filter_size: A Tensor of type integers. An integer vector representing the tensor shape of filter, - * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. - * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. - * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. + * @li filter_size: A Tensor of type integers. An integer vector representing + * the tensor shape of filter, + * where filter is a 4-D tensor [filter_height, filter_width, in_channels, + * out_channels] or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li strides: A tuple/list of 4 integers. The stride of the sliding window + * for H/W dimension. The index of H/W is same as data_format. + * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on + * feature map + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Type is float32 *@par Third-party framework compatibility * Compatible with Tensorflow's conv2d_backprop_filter +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DBackpropFilter instead. */ REG_OP(Conv2DBackpropFilterD) .INPUT(x, TensorType({DT_FLOAT16})) @@ -638,25 +695,34 @@ REG_OP(Conv2DCompress) /** *@brief Computes a 3D convolution given 5D "x" and "filter" tensors. *@par Inputs: - * @li x: A 5D tensor. Must be one of the following types: float16, (Currently does not support int8). - * The format of x is NCDHW or NDHWC. - * @li filter: A 5D tensor of the same type as "x". The format is NCDHW, NDHWC or DHWCN. + * @li x: A 5D tensor. 
Must be one of the following types: float16, + * (Currently does not support int8). The format of x is NCDHW or NDHWC. + * @li filter: A 5D tensor of the same type as "x". + * (Currently does not support int8). + * The format is NCDHW, NDHWC or DHWCN. *@par Optional input: * @li bias: An optional 1D tensor of the same type as "x". * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A list of 5 integers. Specifies the stride of the sliding window + * for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A list of 6 integers. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. + * @li pads: A list of 6 integers. + * Supports only padding along the D, H and W dimensions in sequence of head, + * tail, top, bottom, left and right. *@par Attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A list of 5 integers. Specifies the dilation factor for each dimension of "x". + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A list of 5 integers. Specifies the dilation factor for each + * dimension of "x", now only support [1,1,1,1,1] * The N and C dimensions must be 1. Has the same format as "x". - * @li offset_x: An optional int. Input offset, used for quantized inference. Defaults to 0. + * @li offset_x: An optional int. Input offset, used for quantized inference. + * Defaults to 0. Reserved. *@par Outputs: *y: A Tensor. Has the same type as "x". @@ -687,22 +753,33 @@ REG_OP(Conv3D) *@brief Computes the gradients of convolution 3d with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32, int64. An integer vector representing the shape of input, - * where input is a 5-D tensor [batch, depth, height, width, channels] or [batch, channels, depth, height, width]. - * @li filter: A Tensor. Must be one of the following types: float16, float32, float64. - * @li out_backprop: A Tensor. Must have the same type as filter. 5-D with shape [batch, depth, out_height, out_width, out_channels] - * or [batch, out_channels, depth, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li input_size: A Tensor of type int32, int64. An integer vector representing + * the shape of input, where input is a 5-D tensor + * [batch, depth, height, width, channels] or + * [batch, channels, depth, height, width]. + * @li filter: A Tensor. Must be one of the following types: float16, float32, + * float64. + * @li out_backprop: A Tensor. Must have the same type as filter. + * 5-D with shape [batch, depth, out_height, out_width, out_channels] + * or [batch, out_channels, depth, out_height, out_width]. Gradients with + * respect to the output of the convolution. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A list of 5 integers. 
Specifies the stride of the sliding window + * for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A list of 6 integers. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. + * @li pads: A list of 6 integers. + * Supports only padding along the D, H and W dimensions in sequence of head, + * tail, top, bottom, left and right. *@par Attributes: * Three attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of the input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of the input, now only support [1,1,1,1,1] *@par Outputs: * y: A Tensor. Has the same type as filter, and has the same format as input_size @@ -730,22 +807,31 @@ REG_OP(Conv3DBackpropInput) * @li out_backprop: A Tensor. Must have the same type as filter. *@par Required Attributes: - * @li strides: A list of 5 integers. Specifies the stride of the sliding window for - * each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". + * @li strides: A list of 5 integers. Specifies the stride of the sliding window + * for each dimension of "x". + * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A list of 6 integers. Supports only padding along the D, H and W * dimensions in sequence of head, tail, top, bottom, left and right. - * @li input_size: A tuple/list of type int32, int64. An integer vector representing the shape of input, - * where input is a 5-D tensor [batch, depth, height, width, channels] or [batch, channels, depth, height, width]. + * @li input_size: A tuple/list of type int32, int64. An integer vector + * representing the shape of input, where input is a 5-D tensor + * [batch, depth, height, width, channels] or + * [batch, channels, depth, height, width]. *@par Attributes: * Three attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1] *@par Outputs: * y: A Tensor. Has the same type as filter *@par Third-party framework compatibility * Compatible with Tensorflow's conv3d_backprop_input + +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DBackpropInput instead. 
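+ +*@par Example: +* An illustrative setting (values assumed, not prescribed by the original doc): +* with "x" in NDHWC format, strides = [1, 1, 2, 2, 1] moves the sliding window by +* 2 along H and W only, and pads = [0, 0, 1, 1, 1, 1] pads D by 0/0 (head/tail), +* H by 1/1 (top/bottom) and W by 1/1 (left/right).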
*/ REG_OP(Conv3DBackpropInputD) .INPUT(filter, TensorType({DT_FLOAT16})) @@ -760,7 +846,7 @@ REG_OP(Conv3DBackpropInputD) .OP_END_FACTORY_REG(Conv3DBackpropInputD) /** -*@brief Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.. +*@brief Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. *@par Inputs: * @li x: A Tensor dtype of float16. @@ -806,24 +892,35 @@ REG_OP(LSTM) *@brief Computes the gradients of convolution3D with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16, float32, double. - * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] or [batch, in_depth, in_channels, in_height, in_width]. - * @li filter_size: A Tensor of type int32. An integer vector representing the tensor shape of filter, - * where filter is a 5-D tensor [filter_depth, filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_depth, filter_height, filter_width, in_channels] or [out_channels, filter_depth, in_channel, filter_height, filter_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 5-D with shape [batch, out_depth, out_height, out_width, out_channels] - * or [batch, out_depth, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * @li x: A Tensor. Must be one of the following types: float16, float32, + * double. + * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] + * or [batch, in_depth, in_channels, in_height, in_width]. + * @li filter_size: A Tensor of type int32. An integer vector representing the + * tensor shape of filter, where filter is a 5-D tensor + * [filter_depth, filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_depth, filter_height, filter_width, in_channels] + * or [out_channels, filter_depth, in_channel, filter_height, filter_width]. + * @li out_backprop: A Tensor. Must have the same type as x. + * 5-D with shape [batch, out_depth, out_height, out_width, out_channels] + * or [batch, out_depth, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Required Attributes: - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for - * each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] pads on feature map. + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". The N and C dimensions must be 1. + * Has the same format as "x". + * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] + * pads on feature map. *@par Attributes: * Three attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". 
Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x @@ -847,28 +944,40 @@ REG_OP(Conv3DBackpropFilter) *@par Inputs: * Two inputs: * @li x: A Tensor of type float16. - * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] or [batch, in_depth, in_channels, in_height, in_width]. - * @li out_backprop: A Tensor. Must have the same type as x. 5-D with shape [batch, out_depth, out_height, out_width, out_channels] - * or [batch, out_depth, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. + * 5-D with shape [batch, in_depth, in_height, in_width, in_channels] + * or [batch, in_depth, in_channels, in_height, in_width]. + * @li out_backprop: A Tensor. Must have the same type as x. + * 5-D with shape [batch, out_depth, out_height, out_width, out_channels] + * or [batch, out_depth, out_channels, out_height, out_width]. + * Gradients with respect to the output of the convolution. *@par Required Attributes: - * @li filter_size: A tuple/list of type integers. An integer vector representing the tensor shape of filter, - * where filter is a 5-D tensor [filter_depth, filter_height, filter_width, in_channels, out_channels] - * or [out_channels, filter_depth, filter_height, filter_width, in_channels] or [out_channels, filter_depth, in_channel, filter_height, filter_width]. - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li filter_size: A tuple/list of type integers. An integer vector + * representing the tensor shape of filter, where filter is a 5-D tensor + * [filter_depth, filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_depth, filter_height, filter_width, in_channels] + * or [out_channels, filter_depth, in_channel, filter_height, filter_width]. + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". - * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] pads on feature map + * @li pads: A tuple/list of 6 integers, [front, back, top, bottom, left, right] + * pads on feature map *@par Attributes: * Three attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1]. - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: - * y: A Tensor. Has the same type as x + * y: A Tensor of type float32 *@par Third-party framework compatibility * Compatible with Tensorflow's conv3d_backprop_filter +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DBackpropFilter instead. */ @@ -888,27 +997,32 @@ REG_OP(Conv3DBackpropFilterD) *@brief Computes the transpose of convolution 3d with respect to the input. *@par Inputs: * Three inputs: - * @li input_size: A Tensor of type int32. 
An integer vector representing the shape of input + * @li input_size: A Tensor of type int32. An integer vector representing the + * shape of input * @li x: A Tensor of type float16, currently does not support int8 - * @li filter: A Tensor of type float16. + * @li filter: A Tensor of type float16, currently does not support int8 *@par Optional input: * Two optional inputs - * @li bias: An optional 1D tensor of the same type as "x". + * @li bias: An optional 1D tensor of the same type as "x". Reserved. * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding + * window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A tuple/list of 6 integers *@par Attributes: * Five attributes: - * @li groups: Number of blocked connections from input channels to output channels. - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li dilations: A tuple/list of 5 integers, + * The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. - * @li offset_x: Input offset_x value + * @li offset_x: Input offset_x value. Reserved. *@par Outputs: * y: A Tensor. Has the same type as filter */ @@ -931,28 +1045,35 @@ REG_OP(Conv3DTranspose) /** *@brief Computes the transpose of convolution 3d with respect to the input. *@par Inputs: - * @li x: A Tensor of type float16. - * @li filter: A Tensor of type float16. + * @li x: A Tensor of type float16, currently does not support int8 + * @li filter: A Tensor of type float16, currently does not support int8 *@par Optional inputs: - * @li bias: An optional 1D tensor of the same type as "x". + * @li bias: An optional 1D tensor of the same type as "x". Reserved. * @li offset_w: An optional 1D tensor for quantized deconvolution. Reserved. *@par Required Attributes: - * @li input_size: A tuple/list of type int32. An integer vector representing the shape of input - * @li strides: A tuple/list of 5 integers. Specifies the stride of the sliding window for each dimension of "x". + * @li input_size: A tuple/list of type int32. + * An integer vector representing the shape of input + * @li strides: A tuple/list of 5 integers. + * Specifies the stride of the sliding window for each dimension of "x". * The N and C dimensions must be 1. Has the same format as "x". * @li pads: A tuple/list of 6 integers. *@par Attributes: * Five attributes: - * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] - * @li groups: Number of blocked connections from input channels to output channels. - * @li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. 
+ * @li dilations: A tuple/list of 5 integers, The dilation factor for each + * dimension of input, now only support [1,1,1,1,1] + * @li groups: Number of blocked connections from input channels to output + * channels. Reserved. + * @li data_format: An optional string from: "NDHWC", "NCDHW". + * Defaults to "NDHWC". Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. - * @li offset_x: Input offset_x value + * @li offset_x: Input offset_x value. Reserved. *@par Outputs: * y: A Tensor. Has the same type as filter +*@par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DTranspose instead. */ REG_OP(Conv3DTransposeD) .INPUT(x, TensorType({DT_FLOAT16})) @@ -974,23 +1095,29 @@ REG_OP(Conv3DTransposeD) *@brief Computes the transpose of convolution 2d with respect to the input. *@par Inputs: * Five inputs: - * @li input_size: A Tensor of type int32 or int64. An integer vector representing - * the shape of input. - * @li x: A Tensor of type float16, int8. + * @li input_size: A Tensor of type int32 or int64. An integer vector + * representing the shape of input, where input is a 4-D tensor + * [batch, height, width, channels] or [batch, channels, height, width]. + * @li x: A Tensor of type float16, int8. 4-D with shape [batch, out_height, + * out_width, out_channels] or [batch, out_channels, out_height, out_width]. * @li filter: A Tensor of type float16, int8. Must have the same type as "x". - * @li bias: An optional 1D tensor of the same type as "x". + * 4-D with shape [filter_height, filter_width, in_channels, out_channels] + * or [out_channels, filter_height, filter_width, in_channels] + * or [out_channels, in_channel, filter_height, filter_width]. + * @li bias: An optional 1D tensor of type float16 or int32. Format is "ND". * @li offset_w: An optional 1D tensor for quantized inference. Reserved. *@par Required Attributes: - * @li strides: A required list or tuple. The stride of the sliding window for - * height and width for H/W dimension. - * @li pads: A required list or tuple of int32. Padding added to each dimension - * of the input. + * @li strides: A required tuple/list of 4 integers. The stride of the sliding + * window for H/W dimension. The index of H/W is same as data_format. + * @li pads: A required tuple/list of 4 integers, [top, bottom, left, right] + * pads on feature map. *@par Attributes: * Five attributes: - * @li groups: Number of blocked connections from input channels to output channels. + * @li groups: Number of blocked connections from input channels to output + * channels. * Defaults to "1". - * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension - * of input. Must be [1, 1, 1, 1]. + * @li dilations: A tuple/list of 4 integers, The dilation factor for each + * dimension of input. Must be [1, 1, 1, 1]. * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". * Specify the data format of the input and output data. * @li output_padding: The size will be added in the output shape. Defaults @@ -998,7 +1125,8 @@ REG_OP(Conv3DTransposeD) * @li offset_x: An optional int. Input offset, used for quantized inference. * Defaults to "0". *@par Outputs: - * y: A Tensor. Has the same type as "filter". + * y: A Tensor of type float16 or int32, and has the same format as + * input_size. */ REG_OP(Conv2DTranspose) .INPUT(input_size, TensorType({DT_INT32, DT_INT64})) @@ -1045,6 +1173,8 @@ REG_OP(Conv2DTranspose) * Defaults to "0". 
*@par Outputs: * y: A Tensor. Has the same type as "filter". +*@par Restrictions: + * Warning: THIS FUNCTION IS DEPRECATED. Please use Conv2DTranspose instead. */ REG_OP(Conv2DTransposeD) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h index 9a17cd0d..38612463 100644 --- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_detect_ops.h + * \brief + */ #ifndef GE_OP_NN_DETECT_OPS_H_ #define GE_OP_NN_DETECT_OPS_H_ @@ -293,6 +297,8 @@ REG_OP(ROIAlign) *@see SSDDetectionOutput() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead. */ REG_OP(PriorBoxD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -314,6 +320,55 @@ REG_OP(ROIAlign) .ATTR(variance, ListFloat, {0.1}) .OP_END_FACTORY_REG(PriorBoxD); +/** +*@brief Performs SSD prior box detection, with the four additional matrices and the "aspect_ratio" attribute removed compared to PriorBox. + +*@par Inputs: +* Three inputs, including: +*@li x: An NC1HWC0 or NCHW feature map of type float32 or float16. +*@li img: Source image. Has the same type and format as "x". +*@li boxes: An ND tensor of type float32 or float16, specifying the prior box information. Same as output "y". + +*@par Attributes: +*@li min_size: A required list of float32 values, specifying the minimum edge length of a square prior box. +*@li max_size: A required list of float32 values, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size) +*@li img_h: An optional int32, specifying the height of the source image. +*@li img_w: An optional int32, specifying the width of the source image. +*@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. +*@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. +*@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True". +*@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False". +*@li offset: An optional float32, specifying the offset. Defaults to "0.5". +*@li variance: An optional list of float32 values, specifying the variance of a prior box, with either one or four variances. Defaults to "0.1" (one value). + +*@par Outputs: +*y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance. + +*@attention Constraints:\n +* This operator applies only to SSD networks. +*@see SSDDetectionOutput() +*@par Third-party framework compatibility +* It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead. 
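+ +*@par Example: +* An illustrative setting (values assumed): variance = {0.1, 0.1, 0.2, 0.2} supplies +* one variance per box coordinate, while the default {0.1} applies a single variance +* to all four coordinates.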
+*/ + REG_OP(PriorBoxDV2) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .REQUIRED_ATTR(min_size, ListFloat) + .REQUIRED_ATTR(max_size, ListFloat) + .ATTR(img_h, Int, 0) + .ATTR(img_w, Int, 0) + .ATTR(step_h, Float, 0.0) + .ATTR(step_w, Float, 0.0) + .ATTR(flip, Bool, true) + .ATTR(clip, Bool, false) + .ATTR(offset, Float, 0.5) + .ATTR(variance, ListFloat, {0.1}) + .OP_END_FACTORY_REG(PriorBoxDV2); + /** *@brief Performs Position Sensitive ROI Pooling. @@ -574,6 +629,8 @@ and the actual image height and width. *@see Yolo() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV2DetectionOutput instead. */ REG_OP(YoloV2DetectionOutputD) .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -700,6 +757,8 @@ and the actual image height and width. *@see Yolo() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV3DetectionOutput instead. */ REG_OP(YoloV3DetectionOutputD) .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -926,12 +985,17 @@ REG_OP(ClipBoxes) /** *@brief Computes ClipBoxesD function. +*@par Attributes: +*img_size: A Tensor of shape [H, W]. + *@par Inputs: -*@li boxes_input: A Tensor. Must be float16. N-D with shape [N, 4]. -*@li img_size: A Tensor. Must be int32. shape [H, W]. +*boxes_input: A Tensor. Must be float16. N-D with shape [N, 4]. *@par Outputs: *boxes_output: A Tensor. Must have the same type as boxes_input. N-D with shape [N, 4]. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ClipBoxesD) .INPUT(boxes_input, TensorType({DT_FLOAT16})) @@ -1032,6 +1096,11 @@ REG_OP(RpnProposals) * @par Third-party framework compatibility * Compatible with the pytorch operator RPNProposals. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*Warning: THIS FUNCTION IS DEPRECATED. Please use RpnProposals instead. */ REG_OP(RpnProposalsD) .INPUT(rois, TensorType({DT_FLOAT16})) diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h index 52e7702c..05470e88 100644 --- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_norm_ops.h + * \brief + */ #ifndef GE_OP_NN_NORM_OPS_H #define GE_OP_NN_NORM_OPS_H @@ -342,9 +346,9 @@ REG_OP(ConfusionSoftmaxGrad) *@brief Function softmax gradients ext. *@par Inputs: -* @li grad: A Tensor dtype of float16. +* @li grad: A Tensor dtype of float16, float32. * @li x1: A Tensor dtype of float16, float32. -* @li x2: A Tensor dtype of float16. +* @li x2: A Tensor dtype of float16, float32. *@par Attributes: *@li axis: An int scalar. The axis for reduce. diff --git a/third_party/fwkacllib/inc/ops/nn_ops.h b/third_party/fwkacllib/inc/ops/nn_ops.h index 7637da07..ea4a5ba3 100644 --- a/third_party/fwkacllib/inc/ops/nn_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file nn_ops.h + * \brief + */ #ifndef GE_OP_NN_OPS_H_ #define GE_OP_NN_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h index a7d4c6e3..4878935f 100644 --- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_pooling_ops.h + * \brief + */ #ifndef GE_OP_NN_POOLING_OPS_H #define GE_OP_NN_POOLING_OPS_H @@ -223,8 +227,7 @@ REG_OP(MaxPool) *@brief Performs max 3d pooling on the input. *@par Inputs: -*x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16, \n -int32, int64, uint8, uint16, qint8 +*x: An NC1HWC0 Tensor. Supported types: float16, float32, double. *@par Attributes: *@li ksize: A required list of int8, int16, int32, or int64 values, \n specifying the size of the window for each dimension of the input tensor. \n No default value. *@li strides: A required list of int8, int16, int32, or int64 values, \n specifying the stride of the sliding window for each dimension of \n the input tensor. No default value. -*@li padding: A required string. No default value. -*@li pads: A list type of int32. Default value {0, 0, 0, 0, 0, 0}. -*@li dilation: A list type of int32. Default value {0,0,0}. +*@li padding: An optional string. Default value "SAME". +*@li pads: A list type of int32. Default value {0, 0, 0}. +*@li dilation: A list type of int32. Default value {1, 1, 1}. *@li ceil_mode: A ceil mode number of int32. Default value 0. -*@li data_format: An optional string. Defaults to "NHWC". +*@li data_format: An optional string. Defaults to "NDHWC". *@par Outputs: *y: A Tensor. Has the same type and format as input "x". @@ -635,6 +638,9 @@ REG_OP(AvgPoolGrad) * @par Outputs: * @out_grad: A mutable tensor with the same shape and type as "orig_input". +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use AvgPoolGrad instead. */ REG_OP(AvgPoolGradD) .INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) @@ -956,6 +962,9 @@ REG_OP(AvgPool1D) *@par Third-party framework compatibility *@li compatible with pytorch AvgPool1D operator. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use AvgPool1D instead. */ REG_OP(AvgPool1DD) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h index 0ecaf9a3..0dab8606 100644 --- a/third_party/fwkacllib/inc/ops/nn_training_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file nn_training_ops.h + * \brief + */ #ifndef GE_OP_TRAINING_OPS_H #define GE_OP_TRAINING_OPS_H @@ -178,6 +182,9 @@ REG_OP(SparseApplyAdagrad) *@par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyAdagrad. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdagrad instead. */ REG_OP(SparseApplyAdagradD) .INPUT(var, TensorType({DT_FLOAT})) @@ -247,6 +254,9 @@ REG_OP(SparseApplyAdagradV2) *@par Third-party framework compatibility *Compatible with the TensorFlow operator SparseApplyAdagradV2. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdagradV2 instead. 
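+ +*@par Example: +* A sketch of the underlying update, assuming TensorFlow-compatible AdagradV2 +* semantics: for each row i referenced by "indices", +* accum[i] += grad[i] * grad[i] and +* var[i] -= lr * grad[i] / (sqrt(accum[i]) + epsilon).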
*/ REG_OP(SparseApplyAdagradV2D) .INPUT(var, TensorType({DT_FLOAT})) @@ -440,6 +450,8 @@ REG_OP(ApplyKerasMomentum) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ResourceApplyKerasMomentum. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyKerasMomentum instead. */ REG_OP(ApplyKerasMomentumD) .INPUT(var, TensorType::NumberType()) @@ -500,6 +512,9 @@ REG_OP(ApplyKerasMomentumD) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ResourceApplyKerasMomentum. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdamWithAmsgrad instead. +* */ REG_OP(ApplyAdamWithAmsgradD) .INPUT(var, TensorType::NumberType()) @@ -1113,6 +1128,8 @@ REG_OP(ApplyAdagradV2) * @par Third-party framework compatibility * Compatible with the TensorFlow operator ApplyAdagrad. * +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdagradV2 instead. */ REG_OP(ApplyAdagradV2D) .INPUT(var, TensorType::NumberType()) @@ -1389,6 +1406,9 @@ REG_OP(ApplyRMSProp) * * @par Third-party framework compatibility * @li Compatible with the TensorFlow operator ApplyRMSProp. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyRMSProp instead. */ REG_OP(ApplyRMSPropD) .INPUT(var, TensorType::NumberType()) @@ -2184,6 +2204,9 @@ REG_OP(SparseApplyFtrl) * @par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyFtrl. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyFtrl instead. */ REG_OP(SparseApplyFtrlD) .INPUT(var, TensorType({DT_FLOAT})) @@ -2281,6 +2304,9 @@ REG_OP(SparseApplyFtrlV2) * @par Third-party framework compatibility * Compatible with the TensorFlow operator SparseApplyFtrlV2D. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyFtrlV2 instead. */ REG_OP(SparseApplyFtrlV2D) .INPUT(var, TensorType({DT_FLOAT})) @@ -2389,6 +2415,9 @@ REG_OP(SparseApplyRMSProp) * @li Note that in this sparse implementation, "ms" and "mom" will not update * in iterations during which "grad" is 0. * @li The input tensors "var", "ms" and "mom" must have the same shape. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyRMSProp instead. */ REG_OP(SparseApplyRMSPropD) .INPUT(var, TensorType::NumberType()) @@ -2492,6 +2521,9 @@ REG_OP(SparseApplyAdadelta) * @li Note that in this sparse implementation, "accum" and "accum_update" will not update * in iterations during which "grad" is 0. * @li The input tensors "var", "accum" and "accum_update" must have the same shape. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyAdadelta instead. */ REG_OP(SparseApplyAdadeltaD) .INPUT(var, TensorType::NumberType()) diff --git a/third_party/fwkacllib/inc/ops/no_op.h b/third_party/fwkacllib/inc/ops/no_op.h index 61e187c4..9cde8a0f 100644 --- a/third_party/fwkacllib/inc/ops/no_op.h +++ b/third_party/fwkacllib/inc/ops/no_op.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file no_op.h + * \brief + */ #ifndef GE_NO_OP_H_ #define GE_NO_OP_H_ diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index 310325c8..d265d4e5 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file nonlinear_fuc_ops.h + * \brief + */ #ifndef GE_OP_NONLINEAR_FUC_OPS_H #define GE_OP_NONLINEAR_FUC_OPS_H @@ -58,6 +62,43 @@ REG_OP(GeluGrad) .OUTPUT(z, TensorType({DT_FLOAT16, DT_FLOAT})) .OP_END_FACTORY_REG(GeluGrad) +/** +*@brief Computes the fast_gelu of "x". + +*@par Inputs: +*One input, including: +* @li x: A Tensor. Must be one of the following types: float16, float32 + +*@par Outputs: +*y: A Tensor. Has the same type as "x". +*@par Third-party framework compatibility +*Compatible with the TensorFlow operator FastGelu +*/ +REG_OP(FastGelu) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(FastGelu) + +/** +*@brief Computes the gradient for the fast_gelu of "x". + +*@par Inputs: +*Two inputs, including: +* @li dy: A Tensor. Must be one of the following types: float16, float32 +* @li x: A Tensor of the same type as "dy". + +*@par Outputs: +*z: A Tensor. Has the same type as "dy". +*@par Third-party framework compatibility +*Compatible with the TensorFlow operator FastGeluGrad +*/ +REG_OP(FastGeluGrad) + .INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(z, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(FastGeluGrad) + + /** *@brief Computes the gradient for the tanh of "x". @@ -153,6 +194,9 @@ REG_OP(Relu6) * @par Third-party framework compatibility * Compatible with the TensorFlow operator Relu6. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Relu6 instead. */ REG_OP(Relu6D) .INPUT(x, TensorType::RealNumberType()) @@ -535,14 +579,17 @@ REG_OP(LeakyReluGrad) *@brief Computes the threshold gradient of each element of the input Tensor. *@par Inputs: -* @li gradients: A Tensor shape and dtype of input gradients. Support float16, float32, int8, uint8, int32. -* @li features: A Tensor shape and dtype of input features. Support float16, float32, int8, uint8, int32. +* @li gradients: A Tensor shape and dtype of input gradients. Support float16, int32. +* @li features: A Tensor shape and dtype of input features. Support float16, int32. *@par Attributes: *threshold: A float32 scalar value to threshold at. *@par Outputs: *backprops: A Tensor of shape and dtype of output backprops, should be same shape and type as inputs. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ThresholdGradV2D) .INPUT(gradients, TensorType({DT_INT32, DT_FLOAT16})) @@ -555,7 +602,7 @@ REG_OP(ThresholdGradV2D) *@brief Thresholds each element of the input Tensor y = (x > threshold) ? x : value. *@par Inputs: -*x: A Tensor dtype of float16, float32, int8, uint8, int32. +*x: A Tensor dtype of real number. *@par Attributes: *@li threshold: A float32 scalar value to threshold at. @@ -563,6 +610,9 @@ REG_OP(ThresholdGradV2D) *@par Outputs: *y: A Tensor of shape and dtype of output, should be same shape and type as input. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ThresholdV2D) .INPUT(x, TensorType::RealNumberType()) diff --git a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h index 8e9e1638..91aff6ba 100644 --- a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h +++ b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h @@ -14,6 +14,11 @@ * limitations under the License. */ +/*! 
+ * \file npu_loss_scale_ops.h + * \brief + */ + #ifndef GE_OP_NN_LOSS_SCALE_OPS_H #define GE_OP_NN_LOSS_SCALE_OPS_H #include "graph/operator_reg.h" diff --git a/third_party/fwkacllib/inc/ops/outfeed_ops.h b/third_party/fwkacllib/inc/ops/outfeed_ops.h index af27140a..139e4880 100644 --- a/third_party/fwkacllib/inc/ops/outfeed_ops.h +++ b/third_party/fwkacllib/inc/ops/outfeed_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file outfeed_ops.h + * \brief + */ #ifndef GE_OP_OUTFEED_OPS_H #define GE_OP_OUTFEED_OPS_H diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h index f7153936..6a0492f6 100644 --- a/third_party/fwkacllib/inc/ops/pad_ops.h +++ b/third_party/fwkacllib/inc/ops/pad_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file pad_ops.h + * \brief + */ #ifndef GE_OP_PAD_OPS_H #define GE_OP_PAD_OPS_H diff --git a/third_party/fwkacllib/inc/ops/parsing_ops.h b/third_party/fwkacllib/inc/ops/parsing_ops.h index a8a3e7a1..e73a69fe 100644 --- a/third_party/fwkacllib/inc/ops/parsing_ops.h +++ b/third_party/fwkacllib/inc/ops/parsing_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file parsing_ops.h + * \brief + */ #ifndef GE_OP_PARSING_OPS_H #define GE_OP_PARSING_OPS_H diff --git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h index 4cb80cea..772f9edb 100644 --- a/third_party/fwkacllib/inc/ops/quantize_ops.h +++ b/third_party/fwkacllib/inc/ops/quantize_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file quantize_ops.h + * \brief + */ #ifndef GE_OP_QUANTIZE_OPS_H #define GE_OP_QUANTIZE_OPS_H #include "graph/operator_reg.h" @@ -143,14 +147,14 @@ REG_OP(AscendAntiQuant) *@par Inputs: *@li x0: An NC1HWC0 tensor of type int32, specifying the input. -*@li deq_scale: An NC1HWC0 tensor of type float16 or uint64, specifying the scaling ratio. +*@li deq_scale: An NC1HWC0 tensor of type uint64, specifying the scaling ratio. *@li x1: An NC1HWC0 tensor of type int16, specifying the input. *@par Attributes: *relu_flag: An optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False". *@par Outputs: -*y: The dequantized output tensor of type float16 or float32 and with format NC1HWC0. +*y: The dequantized output tensor of type int16 and with format NC1HWC0. *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h index 2b8bba5f..d0f2b1c5 100644 --- a/third_party/fwkacllib/inc/ops/ragged_array_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ragged_array_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_ARRAY_OPS_H #define GE_OP_RAGGED_ARRAY_OPS_H diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h index 82fd84b7..a95884a8 100644 --- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file ragged_conversion_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_CONVERSION_OPS_H #define GE_OP_RAGGED_CONVERSION_OPS_H #include "graph/operator_reg.h" diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h index e56c35a5..5acdb7f6 100644 --- a/third_party/fwkacllib/inc/ops/ragged_math_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file ragged_math_ops.h + * \brief + */ #ifndef GE_OP_RAGGED_MATH_OPS_H #define GE_OP_RAGGED_MATH_OPS_H diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h index a35e8b3a..8c95ea64 100644 --- a/third_party/fwkacllib/inc/ops/random_ops.h +++ b/third_party/fwkacllib/inc/ops/random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file random_ops.h + * \brief + */ #ifndef GE_OP_RANDOM_OPS_H_ #define GE_OP_RANDOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index d3dfefe1..a7f8a178 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file reduce_ops.h + * \brief + */ #ifndef GE_OP_REDUCE_OPS_H #define GE_OP_REDUCE_OPS_H @@ -208,7 +212,7 @@ REG_OP(BNTrainingUpdateV2) /** *@brief Performs reduced batch normalization v3. For some scenes that don't contain -assignmoving average. +assign moving average. *@par Inputs: * Five inputs, including: (NC1HWC0 supported) @@ -222,7 +226,6 @@ assignmoving average. *epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero. *@par Outputs: -* Three outputs, including: (NC1HWC0 supported) *@li y: A 5D Tensor of type float16 or float32, for normalized "x". *@li batch_mean: A 5D Tensor of type float32, for the mean of "x". *@li batch_variance: A 5D Tensor of type float32, for the variance of "x". diff --git a/third_party/fwkacllib/inc/ops/resource_variable_ops.h b/third_party/fwkacllib/inc/ops/resource_variable_ops.h index 04aadf40..a4d54088 100644 --- a/third_party/fwkacllib/inc/ops/resource_variable_ops.h +++ b/third_party/fwkacllib/inc/ops/resource_variable_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file resource_variable_ops.h + * \brief + */ #ifndef GE_OP_RESOURCE_VARIABLE_OPS_H #define GE_OP_RESOURCE_VARIABLE_OPS_H diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h index ebc59a34..ee19865f 100644 --- a/third_party/fwkacllib/inc/ops/rnn.h +++ b/third_party/fwkacllib/inc/ops/rnn.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file rnn.h + * \brief + */ #ifndef GE_OP_RNN_H #define GE_OP_RNN_H @@ -85,6 +89,76 @@ REG_OP(DynamicLSTM) .OUTPUT(output_h, TensorType({DT_FLOAT32})) .OP_END_FACTORY_REG(DynamicLSTM) +/** +*@brief: DynamicRNN calculation. +*@par Inputs: +*ten inputs: \n +*@li x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li w:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li b:A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. +*@li seq_length:A 1D Tensor. Must be one of the following types: int32. The format must be ND. +*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li init_c:A 4D Tensor. 
Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li wci:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li wcf:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li wco:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM. +*@li mask:A 1D Tensor. Must be one of the following types: uint8. The format must be ND. + +*@par Attributes: +*@li cell_type:A string identifying the cell type in the op. Defaults to "LSTM". Only LSTM is currently supported. +*@li direction:A string identifying the direction in the op. Defaults to "UNIDIRECTIONAL". Only UNIDIRECTIONAL is currently supported. +*@li cell_depth:An integer identifying the cell depth in the op. Defaults to 1. +*@li use_peephole:A bool identifying whether to use peephole in the op. Defaults to false. +*@li keep_prob:A float identifying the keep prob in the op. Defaults to 1. +*@li cell_clip:A float identifying the cell clip in the op. Defaults to -1. +*@li num_proj:An integer identifying the num projection in the op. Defaults to 0. +*@li time_major:A bool identifying the time major in the op. Defaults to false. +*@li activation:A string identifying the type of activation function in the op. Defaults to "tanh". Only tanh is currently supported. +*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0. +*@li is_training:A bool identifying whether the op is in training mode. Defaults to true. + +*@par Outputs: +*eight outputs: \n +*@li y:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li i:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li j:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li tanhc:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. 
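+ +*@par Example: +* A minimal construction sketch, assuming the generated ge::op::DynamicRNN wrapper +* and illustrative tensor handles x, w and b: +* auto rnn = ge::op::DynamicRNN("dynamic_rnn"); +* rnn.set_input_x(x).set_input_w(w).set_input_b(b); +* rnn.set_attr_time_major(true);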
+*/ +REG_OP(DynamicRNN) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(seq_length, TensorType({DT_UINT32})) + .OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wcf, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(wco, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(mask, TensorType({DT_UINT8})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(output_c, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(i, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(j, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(f, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(tanhc, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(cell_type, String, "LSTM") + .ATTR(direction, String, "UNIDIRECTIONAL") + .ATTR(cell_depth, Int, 1) + .ATTR(use_peephole, Bool, false) + .ATTR(keep_prob, Float, 1.0) + .ATTR(cell_clip, Float, -1.0) + .ATTR(num_proj, Int, 0) + .ATTR(time_major, Bool, false) + .ATTR(forget_bias, Float, 0.0) + .ATTR(is_training, Bool, true) + .OP_END_FACTORY_REG(DynamicRNN) + /** *@brief: Basic LSTM Cell backward calculation. Calculates the gradient of input and hidden state. *@par Inputs: diff --git a/third_party/fwkacllib/inc/ops/rpn_ops.h b/third_party/fwkacllib/inc/ops/rpn_ops.h index 252bfdb0..1484e95e 100644 --- a/third_party/fwkacllib/inc/ops/rpn_ops.h +++ b/third_party/fwkacllib/inc/ops/rpn_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file rpn_ops.h + * \brief + */ #ifndef GE_OP_RPN_OPS_H #define GE_OP_RPN_OPS_H diff --git a/third_party/fwkacllib/inc/ops/save_ops.h b/third_party/fwkacllib/inc/ops/save_ops.h index a3b9d397..7fd853d3 100644 --- a/third_party/fwkacllib/inc/ops/save_ops.h +++ b/third_party/fwkacllib/inc/ops/save_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file save_ops.h + * \brief + */ #ifndef GE_OP_SAVE_OPS_H_ #define GE_OP_SAVE_OPS_H_ @@ -21,6 +25,13 @@ namespace ge { +/** +*@brief Mark which tensors need to be saved to the ckpt file. +*@par Inputs: +*tensors: A list of input tensors. +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(Save) .DYNAMIC_INPUT(tensors, TensorType::ALL()) .OP_END_FACTORY_REG(Save) @@ -28,4 +39,4 @@ REG_OP(Save) } // namespace ge -#endif // GE_OP_SAVE_OPS_H_ \ No newline at end of file +#endif // GE_OP_SAVE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h index 2cbafc3c..712fc1fc 100644 --- a/third_party/fwkacllib/inc/ops/sdca_ops.h +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file sdca_ops.h + * \brief + */ #ifndef GE_OP_SDCA_OPS_H #define GE_OP_SDCA_OPS_H diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index 47cf4a47..1328ae52 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file selection_ops.h + * \brief + */ #ifndef GE_OP_SELECTION_OPS_H #define GE_OP_SELECTION_OPS_H #include "graph/operator_reg.h" @@ -125,6 +129,8 @@ REG_OP(Tile) *@par Third-party framework compatibility *Compatible with the TensorFlow operator Tile. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Tile instead. */ REG_OP(TileD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) @@ -540,6 +546,8 @@ REG_OP(ReverseV2) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ReverseV2. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ReverseV2 instead. */ REG_OP(ReverseV2D) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, @@ -755,6 +763,8 @@ REG_OP(Slice) *@par Outputs: *y: A Tensor. Has the same type as "x". The slice extracted from the tensor. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Slice instead. */ REG_OP(SliceD) .INPUT(x, TensorType::BasicType()) @@ -915,6 +925,9 @@ REG_OP(ScatterNdD) * @par Third-party framework compatibility * Compatible with the TensorFlow operator InTopK. +* +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use InTopK instead. */ REG_OP(InTopKD) .INPUT(x1, TensorType({DT_FLOAT})) @@ -1027,6 +1040,9 @@ REG_OP(StridedSliceAssign) * "value" shape must be exactly the shape produced by the slice of "var". * @see StridedSlice() +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSliceAssign instead. */ REG_OP(StridedSliceAssignD) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16})) @@ -1407,7 +1423,10 @@ REG_OP(UnsortedSegmentMin) * @par Outputs: * y: A Tensor. Must have the same type as input "x". -* @see UnsortedSegmentProdD(), +* @see UnsortedSegmentProdD(), UnsortedSegmentSumD(), +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentMin instead. */ REG_OP(UnsortedSegmentMinD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1457,6 +1476,9 @@ REG_OP(UnsortedSegmentMax) * y: A Tensor. Must have the same type as input "x". * @see UnsortedSegmentProdD(), +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentMax instead. */ REG_OP(UnsortedSegmentMaxD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1505,6 +1527,9 @@ REG_OP(UnsortedSegmentProd) * y: A Tensor. Must have the same type as input "x". * @see UnsortedSegmentMinD() +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentProd instead. */ REG_OP(UnsortedSegmentProdD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1580,6 +1605,8 @@ REG_OP(UnsortedSegmentProdD) *@li actual_rois_num: A Tensor with shape [batch, 8], of type int32, specifying the number of BBoxes output per batch. *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Proposal instead. */ REG_OP(ProposalD) .INPUT(cls_prob, TensorType({DT_FLOAT16, DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/set_ops.h b/third_party/fwkacllib/inc/ops/set_ops.h index d9478380..f4d5c4ba 100644 --- a/third_party/fwkacllib/inc/ops/set_ops.h +++ b/third_party/fwkacllib/inc/ops/set_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file set_ops.h + * \brief + */ #ifndef GE_OP_SET_OPS_H_ #define GE_OP_SET_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 6b5600f7..eb3629a4 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file sparse_ops.h + * \brief + */ #ifndef GE_OP_SPARSE_OPS_H_ #define GE_OP_SPARSE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/spectral_ops.h b/third_party/fwkacllib/inc/ops/spectral_ops.h index 53b3e848..4c412a1f 100644 --- a/third_party/fwkacllib/inc/ops/spectral_ops.h +++ b/third_party/fwkacllib/inc/ops/spectral_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file spectral_ops.h + * \brief + */ #ifndef GE_OP_SPECTRAL_OPS_H #define GE_OP_SPECTRAL_OPS_H diff --git a/third_party/fwkacllib/inc/ops/split_combination_ops.h b/third_party/fwkacllib/inc/ops/split_combination_ops.h index 7e4428d0..de7300d2 100644 --- a/third_party/fwkacllib/inc/ops/split_combination_ops.h +++ b/third_party/fwkacllib/inc/ops/split_combination_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file split_combination_ops.h + * \brief + */ #ifndef GE_OP_SPLIT_COMBINATION_OPS_H #define GE_OP_SPLIT_COMBINATION_OPS_H #include "graph/operator_reg.h" @@ -197,6 +201,8 @@ REG_OP(ParallelConcat) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ConcatV2. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ConcatV2 instead. */ REG_OP(ConcatV2D) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) @@ -254,6 +260,8 @@ REG_OP(ConcatV2) *@par Third-party framework compatibility * Compatible with the TensorFlow operator Concat. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Concat instead. */ REG_OP(ConcatD) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) @@ -360,6 +368,8 @@ REG_OP(ConcatOffset) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ConcatOffset. +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use ConcatOffset instead. */ REG_OP(ConcatOffsetD) .DYNAMIC_INPUT(x, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/state_ops.h b/third_party/fwkacllib/inc/ops/state_ops.h index 4e759688..2261cd3e 100644 --- a/third_party/fwkacllib/inc/ops/state_ops.h +++ b/third_party/fwkacllib/inc/ops/state_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file state_ops.h + * \brief + */ #ifndef GE_OP_STATE_OPS_H_ #define GE_OP_STATE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h index eb3db1cc..0bcb87cd 100644 --- a/third_party/fwkacllib/inc/ops/stateful_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! 
+ * \file stateful_random_ops.h + * \brief + */ #ifndef GE_OP_STATEFUL_RANDOM_OPS_H #define GE_OP_STATEFUL_RANDOM_OPS_H diff --git a/third_party/fwkacllib/inc/ops/stateless_random_ops.h b/third_party/fwkacllib/inc/ops/stateless_random_ops.h index 03fc824a..ddfda47d 100644 --- a/third_party/fwkacllib/inc/ops/stateless_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateless_random_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file stateless_random_ops.h + * \brief + */ #ifndef GE_OP_STATELESS_RANDOM_OPS_H #define GE_OP_STATELESS_RANDOM_OPS_H diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h index d085a868..8b4b7250 100644 --- a/third_party/fwkacllib/inc/ops/string_ops.h +++ b/third_party/fwkacllib/inc/ops/string_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file string_ops.h + * \brief + */ #ifndef GE_OP_STRING_OPS_H_ #define GE_OP_STRING_OPS_H_ @@ -44,6 +48,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringSplit op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringSplit) .INPUT(input, TensorType({DT_STRING})) @@ -76,6 +83,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringSplitV2 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringSplitV2) .INPUT(input, TensorType({DT_STRING})) @@ -108,6 +118,9 @@ include: \n *@par Third-party framework compatibility *compatible with UnicodeScript op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(UnicodeScript) .INPUT(x, TensorType({DT_INT32})) @@ -139,6 +152,9 @@ include: \n *@par Third-party framework compatibility *compatible with Substr op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(Substr) .INPUT(input, TensorType({DT_STRING})) @@ -169,6 +185,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucketFast op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucketFast) .INPUT(x, TensorType({DT_STRING})) @@ -203,6 +222,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucketStrong op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucketStrong) .INPUT(x, TensorType({DT_STRING})) @@ -228,6 +250,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringToHashBucket op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringToHashBucket) .INPUT(string_tensor, TensorType({DT_STRING})) @@ -249,6 +274,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringStrip op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringStrip) .INPUT(x, TensorType({DT_STRING})) @@ -277,6 +305,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringLength op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
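+ +*@par Example: +* A sketch of the expected behaviour, assuming TensorFlow-compatible byte counting: +* for input ["hello", ""], the output is [5, 0].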
*/ REG_OP(StringLength) .INPUT(x, TensorType({DT_STRING})) @@ -309,6 +340,9 @@ include: \n *@par Third-party framework compatibility *compatible with StringJoin op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringJoin) .DYNAMIC_INPUT(x, TensorType({DT_STRING})) @@ -341,6 +375,9 @@ include: \n *@par Third-party framework compatibility * compatible with StringFormat op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(StringFormat) .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ @@ -372,6 +409,9 @@ include: \n *@par Third-party framework compatibility *compatible with RegexFullMatch op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(RegexFullMatch) .INPUT(x, TensorType({DT_STRING})) @@ -404,6 +444,9 @@ include: \n *@par Third-party framework compatibility *compatible with RegexReplace op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(RegexReplace) .INPUT(x, TensorType({DT_STRING})) @@ -439,6 +482,9 @@ include: \n *@par Third-party framework compatibility *compatible with AsString op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(AsString) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \ @@ -477,6 +523,9 @@ include: \n *@par Third-party framework compatibility *compatible with EncodeBase64 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(EncodeBase64) .INPUT(x, TensorType({DT_STRING})) @@ -500,6 +549,9 @@ include: \n *@par Third-party framework compatibility *compatible with DecodeBase64 op of tensorflow + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(DecodeBase64) .INPUT(x, TensorType({DT_STRING})) diff --git a/third_party/fwkacllib/inc/ops/swap_co_ops.h b/third_party/fwkacllib/inc/ops/swap_co_ops.h index 02f1451b..a6c0f9ca 100644 --- a/third_party/fwkacllib/inc/ops/swap_co_ops.h +++ b/third_party/fwkacllib/inc/ops/swap_co_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file swap_co_ops.h + * \brief + */ #ifndef GE_OP_SWAP_CO_OPS_H_ #define GE_OP_SWAP_CO_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index ddbb1b4d..a7c33ab5 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -14,6 +14,10 @@ * limitations under the License. */ +/*! + * \file transformation_ops.h + * \brief + */ #ifndef GE_OP_TRANSFORMATION_OPS_H #define GE_OP_TRANSFORMATION_OPS_H @@ -93,6 +97,8 @@ REG_OP(DepthwiseWeight6DTo4D) *@par Outputs: *y: A Tensor. Has the same type as "x". +*@par Restrictions: +*Warning: THIS FUNCTION IS DEPRECATED. Please use Transpose instead. */ REG_OP(TransposeD) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, @@ -124,17 +130,17 @@ REG_OP(Transpose) .OP_END_FACTORY_REG(Transpose) /** -*@brief Doing format_transfer for various data format only \n -support NHWC/NCHW to NC1HWC0 and NC1HWC0 to NHWC/NCHW \n -NCHW to FRACTAL_Zn or FRACTAL_Zn to NCHW \n -HWCN to FRACTAL_Zn or FRACTAL_Zn to HWCN. 
+*@brief Performs format transfer for various data formats. Only
+supports "NHWC/NCHW" to "NC1HWC0" and "NC1HWC0" to "NHWC/NCHW",
+"NCHW" to "FRACTAL_Zn" or "FRACTAL_Zn" to "NCHW", and
+"HWCN" to "FRACTAL_Zn" or "FRACTAL_Zn" to "HWCN".
 
 *@par Inputs:
 *src: A Tensor dtype of all types.
 
 *@par Attributes:
-*@li src_format: A string source data format, can be NHWC, NCHW, FRACTAL_Zn etc.
-*@li expose_hidden: A string target data format, can be NC1HWC0, NCHW, FRACTAL_Zn etc.
+*@li src_format: A string; the source data format, such as "NHWC", "NCHW" or "FRACTAL_Zn".
+*@li dst_format: A string; the target data format, such as "NC1HWC0", "NCHW" or "FRACTAL_Zn".
 
 *@par Outputs:
 *dst: A Tensor dtype of all types.
diff --git a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
index 7da49c1e..bf8ecd53 100644
--- a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
+++ b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+/*!
+ * \file warp_perspective_ops.h
+ * \brief
+ */
 #ifndef GE_OP_WARP_PERSPECTIVE_OPS_H_
 #define GE_OP_WARP_PERSPECTIVE_OPS_H_
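Editor's note: the transformation_ops.h hunk above deprecates TransposeD in favor of Transpose, which takes the permutation as a tensor input rather than a compile-time attribute. A minimal migration sketch follows; it assumes the generated ge::op classes and their set_input_*/set_attr_* accessors produced by REG_OP, and the include path may differ per SDK layout.

// Illustrative only: moving off the deprecated TransposeD. The perm becomes
// a Const input to Transpose instead of an attribute on the op itself.
#include "all_ops.h"       // generated operator headers (path is an assumption)
#include "graph/tensor.h"

ge::op::Transpose BuildTranspose(ge::op::Data &x) {
  // Before (deprecated):
  //   auto t = ge::op::TransposeD("t").set_input_x(x).set_attr_perm({0, 3, 1, 2});

  // After: supply perm as a DT_INT32 Const tensor input.
  int32_t perm_value[4] = {0, 3, 1, 2};
  ge::TensorDesc perm_desc(ge::Shape({4}), ge::FORMAT_ND, ge::DT_INT32);
  ge::Tensor perm_tensor(perm_desc, reinterpret_cast<uint8_t *>(perm_value), sizeof(perm_value));
  auto perm = ge::op::Const("perm").set_attr_value(perm_tensor);
  return ge::op::Transpose("transpose").set_input_x(x).set_input_perm(perm);
}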
diff --git a/third_party/fwkacllib/inc/register/host_cpu_context.h b/third_party/fwkacllib/inc/register/host_cpu_context.h
new file mode 100644
index 00000000..f7d4f52f
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/host_cpu_context.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_HOST_CPU_CONTEXT_H_
+#define INC_REGISTER_HOST_CPU_CONTEXT_H_
+
+#include "external/ge/ge_api_error_codes.h"
+#include "register/register_types.h"
+
+namespace ge {
+class HostCpuContext {
+ public:
+  HostCpuContext() = default;
+  ~HostCpuContext() = default;
+ private:
+  class Impl;
+  Impl *impl_;
+};
+}  // namespace ge
+
+extern "C" {
+// Unified entry point for registering host_cpu_kernel_wrapper when the shared object is opened
+FMK_FUNC_HOST_VISIBILITY ge::Status Initialize(const ge::HostCpuContext &ctx);
+}
+
+#endif  // INC_REGISTER_HOST_CPU_CONTEXT_H_
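Editor's note: the new header above defines the extern "C" hook that a host-CPU kernel wrapper library must export. As a rough sketch, a wrapper shared object would implement it along these lines; only the hook name and signature come from host_cpu_context.h, the body is an assumption and not the actual host_cpu_kernel_wrapper code.

// Hypothetical wrapper implementation of the registration hook.
#include "register/host_cpu_context.h"

extern "C" FMK_FUNC_HOST_VISIBILITY ge::Status Initialize(const ge::HostCpuContext &ctx) {
  (void)ctx;  // Impl is opaque to the wrapper; nothing to inspect here
  // A real wrapper would register its host-CPU kernels with the framework
  // here, then report success so the framework keeps the .so loaded.
  return ge::SUCCESS;
}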
diff --git a/third_party/fwkacllib/inc/register/op_registry.h b/third_party/fwkacllib/inc/register/op_registry.h
index 1dc14b8b..3feea0df 100644
--- a/third_party/fwkacllib/inc/register/op_registry.h
+++ b/third_party/fwkacllib/inc/register/op_registry.h
@@ -65,6 +65,9 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
   domi::FusionParseParamFunc GetFusionParseParamFunc(const std::string &op_type, const std::string &ori_type);
 
+  domi::FusionParseParamByOpFunc GetFusionParseParamByOpFunc(const std::string &op_type,
+                                                             const std::string &ori_type);
+
   domi::ParseSubgraphFunc GetParseSubgraphPostFunc(const std::string &op_type);
 
   domi::ImplyType GetImplyTypeByOriOpType(const std::string &ori_optype);
@@ -78,6 +81,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
   std::unordered_map<std::string, ParseParamFunc> op_parse_params_fn_map_;
   std::unordered_map<std::string, ParseParamByOpFunc> parse_params_by_op_func_map_;
   std::unordered_map<std::string, FusionParseParamFunc> fusion_op_parse_params_fn_map_;
+  std::unordered_map<std::string, FusionParseParamByOpFunc> fusion_parse_params_by_op_fn_map_;
   std::unordered_map<std::string, ParseSubgraphFunc> op_types_to_parse_subgraph_post_func_;
   std::unordered_map<std::string, std::vector<RemoveInputConfigure>> remove_input_configure_map_;
   std::unordered_map<std::string, std::string> origin_type_to_om_type_;
diff --git a/third_party/fwkacllib/inc/register/register.h b/third_party/fwkacllib/inc/register/register.h
index 27da0b0b..d98edaa4 100644
--- a/third_party/fwkacllib/inc/register/register.h
+++ b/third_party/fwkacllib/inc/register/register.h
@@ -18,6 +18,7 @@
 #define INC_REGISTER_REGISTRY_H_
 
 #include "external/register/register.h"
+#include "external/ge/ge_api_error_codes.h"
 
 namespace ge {
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 572053f6..2ab522fa 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -201,6 +201,7 @@ typedef enum tagRtError {
     RT_ERROR_FEATURE_NOT_SUPPROT,
     RT_ERROR_MEMORY_ALLOCATION,
     RT_ERROR_MEMORY_FREE,
+    RT_ERROR_INVALID_MEMORY_TYPE,
 
     RT_ERROR_DEBUG_BASE = 0x07120000,
     RT_ERROR_DEBUG_NULL,
diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h
index bf2ce447..048be69a 100644
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -164,7 +164,7 @@ RTS_API rtError_t rtGetDevicePhyIdByIndex(uint32_t devIndex, uint32_t *phyId);
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
-RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
+RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc, uint32_t flag);
 
 /**
  * @ingroup dvrt_dev
@@ -176,6 +176,17 @@ RTS_API rtError_t rtEnableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
  */
 RTS_API rtError_t rtDisableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
 
+/**
+ * @ingroup dvrt_dev
+ * @brief get capability of P2P memory copy between device and peer device.
+ * @param [in] device the logical device id
+ * @param [in] peerDevice the physical device id
+ * @param [out] *canAccessPeer 1:enable 0:disable
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtDeviceCanAccessPeer(int32_t* canAccessPeer, uint32_t device, uint32_t peerDevice);
+
 /**
  * @ingroup dvrt_dev
  * @brief get status
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index aec290da..956e033b 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -177,6 +177,7 @@ typedef void (*rtCallback_t)(void *fnData);
 #define RT_KERNEL_CONVERT (0x01)
 #define RT_KERNEL_DUMPFLAG (0x02)
 #define RT_FUSION_KERNEL_DUMPFLAG (0x04)
+#define RT_KERNEL_CUSTOM_AICPU (0x08)
 
 /**
  * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index 3280f3c6..8e159dd7 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -46,6 +46,15 @@ extern "C" {
 #define RT_MEMORY_L1 ((uint32_t)0x1<<16)
 #define RT_MEMORY_L2 ((uint32_t)0x1<<17)
 
+/**
+ * @ingroup dvrt_mem
+ * @brief memory info type
+ */
+#define RT_MEM_INFO_TYPE_DDR_SIZE ((uint32_t)0x1)
+#define RT_MEM_INFO_TYPE_HBM_SIZE ((uint32_t)0x2)
+#define RT_MEM_INFO_TYPE_DDR_P2P_SIZE ((uint32_t)0x3)
+#define RT_MEM_INFO_TYPE_HBM_P2P_SIZE ((uint32_t)0x4)
+
 /**
  * @ingroup dvrt_mem
  * @brief memory Policy
@@ -54,6 +63,9 @@ extern "C" {
 #define RT_MEMORY_POLICY_HUGE_PAGE_FIRST ((uint32_t)0x1 << 10)  // Malloc mem prior hage page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_ONLY ((uint32_t)0x1 << 11)  // Malloc mem only use hage page
 #define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY ((uint32_t)0x1 << 12)  // Malloc mem only use default page
+#define RT_MEMORY_POLICY_HUGE_PAGE_FIRST_P2P ((uint32_t)0x1 << 13)  // Malloc memory with huge page first, then default page, used for P2P
+#define RT_MEMORY_POLICY_HUGE_PAGE_ONLY_P2P ((uint32_t)0x1 << 14)  // Malloc memory with huge page only, used for P2P
+#define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY_P2P ((uint32_t)0x1 << 15)  // Malloc memory with default page only, used for P2P
 
 #define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)  // mem type bit in <0, 9>
@@ -88,6 +100,19 @@ typedef enum tagRtMemcpyKind {
     RT_MEMCPY_RESERVED,
 } rtMemcpyKind_t;
 
+typedef enum tagRtMemInfoType {
+    RT_MEMORYINFO_DDR,
+    RT_MEMORYINFO_HBM,
+    RT_MEMORYINFO_DDR_HUGE,        // Hugepage memory of DDR
+    RT_MEMORYINFO_DDR_NORMAL,      // Normal memory of DDR
+    RT_MEMORYINFO_HBM_HUGE,        // Hugepage memory of HBM
+    RT_MEMORYINFO_HBM_NORMAL,      // Normal memory of HBM
+    RT_MEMORYINFO_DDR_P2P_HUGE,    // Hugepage P2P memory of DDR
+    RT_MEMORYINFO_DDR_P2P_NORMAL,  // Normal P2P memory of DDR
+    RT_MEMORYINFO_HBM_P2P_HUGE,    // Hugepage P2P memory of HBM
+    RT_MEMORYINFO_HBM_P2P_NORMAL,  // Normal P2P memory of HBM
+} rtMemInfoType_t;
+
 typedef enum tagRtRecudeKind {
     RT_MEMCPY_SDMA_AUTOMATIC_ADD = 10,  // D2D, SDMA inline reduce, include 1P, and P2P
     RT_RECUDE_KIND_END
@@ -350,6 +375,16 @@ RTS_API rtError_t rtMemsetAsync(void *ptr, uint64_t destMax, uint32_t value, uin
  */
 RTS_API rtError_t rtMemGetInfo(size_t *free, size_t *total);
 
+/**
+ * @ingroup dvrt_mem
+ * @brief get current device memory total and free
+ * @param [in] memInfoType
+ * @param [out] free
+ * @param [out] total
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size_t *total);
+
 /**
  * @ingroup dvrt_mem
  * @brief set memory with uint32_t value
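Editor's note: the dev.h and mem.h additions above fit together for P2P setups: query capability, enable the link, then inspect the P2P memory pool. A hedged sketch follows; passing 0 as the new rtEnableP2P flag and using logical device ids for both arguments are assumptions, not documented behavior, and only APIs declared in this patch are used.

// Illustrative P2P bring-up using the new runtime declarations above.
#include <cstdio>
#include "runtime/dev.h"
#include "runtime/mem.h"

bool EnablePeerAccess(uint32_t dev, uint32_t peerDev) {
  int32_t canAccessPeer = 0;
  // 1:enable 0:disable, per the rtDeviceCanAccessPeer doc comment.
  if (rtDeviceCanAccessPeer(&canAccessPeer, dev, peerDev) != RT_ERROR_NONE ||
      canAccessPeer != 1) {
    return false;  // P2P copy not supported between this device pair
  }
  if (rtEnableP2P(dev, peerDev, 0U) != RT_ERROR_NONE) {  // flag: assumed default 0
    return false;
  }
  size_t freeMem = 0;
  size_t totalMem = 0;
  // Query the HBM P2P pool specifically, rather than the overall rtMemGetInfo.
  if (rtMemGetInfoEx(RT_MEMORYINFO_HBM_P2P_NORMAL, &freeMem, &totalMem) == RT_ERROR_NONE) {
    std::printf("HBM P2P memory: %zu free of %zu bytes\n", freeMem, totalMem);
  }
  return true;
}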