| @@ -1,3 +1,22 @@ | |||||
| # Release 1.0.0 | |||||
| ## Major Features and Improvements | |||||
| * Automatically dump the input and output of the abnormal operator when the network execution is abnormal; | |||||
| * Realize dynamic multi-batch based on GotoLabel; | |||||
| * Optimize the performance of dynamic shape; | |||||
| * The dynamic resolution feature supports new scene that the network has multiple inputs and the shape of each input is different. | |||||
| ## Bugfixes | |||||
| * Fixed the issue that the input and output data of the AICPU operator cannot be dumped in the single-operator execution scenario. | |||||
| * Fixed the execution fails in the custom AICPU operator cascading scenario. | |||||
| * Fixed the issue that in the dynamic batch+dynamic AIPP scenario, the getinputformat and getinputdims parameters are inconsistent. | |||||
| ## Thanks to our Contributors | |||||
| Thanks goes to these wonderful people: wuweikang,wangcong,weiyang,yanghaorang,xutianchun,shibeiji,zhouchao, tanghuikang, zhoulili, liujunzhu, zhengyuanhua, taoxiangdong Contributions of any kind are welcome! | |||||
| Contributions of any kind are welcome! | |||||
| # Release 0.7.0-beta | # Release 0.7.0-beta | ||||
| ## Major Features and Improvements | ## Major Features and Improvements | ||||
| @@ -4,7 +4,7 @@ graphengine_add_pkg(securec | |||||
| MD5 0782dd2351fde6920d31a599b23d8c91 | MD5 0782dd2351fde6920d31a599b23d8c91 | ||||
| LIBS c_sec | LIBS c_sec | ||||
| PATCHES ${GE_SOURCE_DIR}/third_party/patch/securec/securec.patch001 | PATCHES ${GE_SOURCE_DIR}/third_party/patch/securec/securec.patch001 | ||||
| CMAKE_OPTION " " | |||||
| CMAKE_OPTION "-DCMAKE_BUILD_TYPE=Release" | |||||
| ) | ) | ||||
| include_directories(${securec_INC}) | include_directories(${securec_INC}) | ||||
| file(COPY ${securec_INC}/../lib/libc_sec.so DESTINATION ${CMAKE_SOURCE_DIR}/build/graphengine) | file(COPY ${securec_INC}/../lib/libc_sec.so DESTINATION ${CMAKE_SOURCE_DIR}/build/graphengine) | ||||
| @@ -0,0 +1,69 @@ | |||||
| /** | |||||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef INC_EXTERNAL_GE_GE_PROF_H_ | |||||
| #define INC_EXTERNAL_GE_GE_PROF_H_ | |||||
| #include <map> | |||||
| #include <string> | |||||
| #include <vector> | |||||
| #include "ge/ge_api_error_codes.h" | |||||
| namespace ge { | |||||
| enum ProfDataTypeConfig { | |||||
| kProfAcl = 0x0001, | |||||
| kProfTaskTime = 0x0002, | |||||
| kProfAiCoreMetrics = 0x0004, | |||||
| kProfAicpuTrace = 0x0008, | |||||
| kProfModelExecute = 0x0010, | |||||
| kProfRuntimeApi = 0x0020, | |||||
| kProfRuntimeTrace = 0x0040, | |||||
| kProfScheduleTimeline = 0x0080, | |||||
| kProfScheduleTrace = 0x0100, | |||||
| kProfAiVectorCoreMetrics = 0x0200, | |||||
| kProfSubtaskTime = 0x0400, | |||||
| kProfTrainingTrace = 0x0800, | |||||
| kProfHcclTrace = 0x1000, | |||||
| kProfDataProcess = 0x2000, | |||||
| kProfTaskTrace = 0x3842, | |||||
| kProfModelLoad = 0x8000000000000000 | |||||
| }; | |||||
| enum ProfilingAicoreMetrics { | |||||
| kAicoreArithmaticThroughput = 0, | |||||
| kAicorePipeline = 1, | |||||
| kAicoreSynchronization = 2, | |||||
| kAicoreMemory = 3, | |||||
| kAicoreInternalMemory = 4, | |||||
| kAicoreStall = 5, | |||||
| kAicoreMetricsAll = 255 // only for op_trace | |||||
| }; | |||||
| typedef struct ProfAicoreEvents ProfAicoreEvents; | |||||
| typedef struct aclgrphProfConfig aclgrphProfConfig; | |||||
| Status aclgrphProfInit(const char *profiler_path, uint32_t length); | |||||
| Status aclgrphProfFinalize(); | |||||
| aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums, | |||||
| ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events, | |||||
| uint64_t data_type_config); | |||||
| Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config); | |||||
| Status aclgrphProfStart(aclgrphProfConfig *profiler_config); | |||||
| Status aclgrphProfStop(aclgrphProfConfig *profiler_config); | |||||
| } // namespace ge | |||||
| #endif // INC_EXTERNAL_GE_GE_PROF_H_ | |||||
| @@ -97,6 +97,7 @@ GE_ERRORNO_COMMON(INTERNAL_ERROR, 4, "Internal errors"); // 1343225 | |||||
| GE_ERRORNO_COMMON(CSEC_ERROR, 5, "Failed to call libc_sec API!"); // 1343225861 | GE_ERRORNO_COMMON(CSEC_ERROR, 5, "Failed to call libc_sec API!"); // 1343225861 | ||||
| GE_ERRORNO_COMMON(TEE_ERROR, 6, "Failed to call tee API!"); // 1343225862 | GE_ERRORNO_COMMON(TEE_ERROR, 6, "Failed to call tee API!"); // 1343225862 | ||||
| GE_ERRORNO_COMMON(END_OF_SEQUENCE, 7, "End of sequence!"); // 1343225863 | GE_ERRORNO_COMMON(END_OF_SEQUENCE, 7, "End of sequence!"); // 1343225863 | ||||
| GE_ERRORNO_COMMON(PATH_INVALID, 8, "Path is invalid!"); // 1343225864 | |||||
| // Error code for plugin manager | // Error code for plugin manager | ||||
| GE_ERRORNO_COMMON(GE_PLGMGR_PATH_INVALID, 30, "Path is invalid!"); // 1343225886 | GE_ERRORNO_COMMON(GE_PLGMGR_PATH_INVALID, 30, "Path is invalid!"); // 1343225886 | ||||
| @@ -124,9 +125,13 @@ GE_ERRORNO_CLIENT(GE_CLI_GE_ALREADY_INITIALIZED, 10, "GE is already initialized. | |||||
| GE_ERRORNO_CLIENT(GE_CLI_GE_NOT_INITIALIZED, 11, "GE is not yet initialized or is finalized."); // 1343229963 | GE_ERRORNO_CLIENT(GE_CLI_GE_NOT_INITIALIZED, 11, "GE is not yet initialized or is finalized."); // 1343229963 | ||||
| // Init module error code definition | // Init module error code definition | ||||
| GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048 | |||||
| GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049 | |||||
| GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050 | |||||
| GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048 | |||||
| GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049 | |||||
| GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050 | |||||
| GE_ERRORNO_INIT(GE_PROF_MULTI_INIT, 3, "Multiple profiling initializations are not supported."); // 1343234051 | |||||
| GE_ERRORNO_INIT(GE_PROF_NOT_INIT, 4, "Profing initializations have not been done."); // 1343234052 | |||||
| GE_ERRORNO_INIT(GE_PROF_MODE_CONFLICT, 5, | |||||
| "Profiling command mode which is preferred is running, the api mode will not work."); // 1343234053 | |||||
| // Session module error code definition | // Session module error code definition | ||||
| GE_ERRORNO_SESSION(GE_SESS_INIT_FAILED, 0, "Failed to initialize session."); // 1343238144 | GE_ERRORNO_SESSION(GE_SESS_INIT_FAILED, 0, "Failed to initialize session."); // 1343238144 | ||||
| @@ -398,6 +398,24 @@ bool CheckOutputPathValid(const std::string &file_path, const std::string &atc_p | |||||
| /// @param [out] result | /// @param [out] result | ||||
| /// | /// | ||||
| bool ValidateStr(const std::string &filePath, const std::string &mode); | bool ValidateStr(const std::string &filePath, const std::string &mode); | ||||
| /// | |||||
| /// @ingroup domi_common | |||||
| /// @brief Check whether the file is normal file. | |||||
| /// @param [in] file_path file path | |||||
| /// @param [out] result | |||||
| /// | |||||
| bool IsValidFile(const char *file_path); | |||||
| /// | |||||
| /// @ingroup domi_common | |||||
| /// @brief Check path invalid | |||||
| /// @param [in] path, path to be checked | |||||
| /// @param [in] length, length of path | |||||
| /// @return 0 success | |||||
| /// @return -1 fail | |||||
| /// | |||||
| Status CheckPath(const char *path, size_t length); | |||||
| } // namespace ge | } // namespace ge | ||||
| #endif // INC_FRAMEWORK_COMMON_UTIL_H_ | #endif // INC_FRAMEWORK_COMMON_UTIL_H_ | ||||
| @@ -142,6 +142,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM | |||||
| GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_DIMS; | GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_DIMS; | ||||
| GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED; | |||||
| GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID; | GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID; | ||||
| GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_GRAPH_NAME; | GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_GRAPH_NAME; | ||||
| @@ -658,7 +658,7 @@ ComputeGraph::UpdateOutputMapping(const std::map<uint32_t, uint32_t> &output_map | |||||
| return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
| } | } | ||||
| size_t num = op_desc->GetInputsSize(); | |||||
| size_t num = op_desc->GetAllInputsSize(); | |||||
| for (size_t i = 0; i < num; i++) { | for (size_t i = 0; i < num; i++) { | ||||
| GeTensorDesc tensor = op_desc->GetInputDesc(i); | GeTensorDesc tensor = op_desc->GetInputDesc(i); | ||||
| uint32_t cur_index = 0; | uint32_t cur_index = 0; | ||||
| @@ -149,9 +149,10 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std | |||||
| // consider special node save process | // consider special node save process | ||||
| // get all input desc format | // get all input desc format | ||||
| bool node_is_all_nd = false; | bool node_is_all_nd = false; | ||||
| auto input_size = static_cast<uint32_t>(op_desc->GetInputsSize()); | |||||
| auto input_size = static_cast<uint32_t>(op_desc->GetAllInputsSize()); | |||||
| for (uint32_t i = 0; i < input_size; i++) { | for (uint32_t i = 0; i < input_size; i++) { | ||||
| // Operator pre-set format but not origin format | // Operator pre-set format but not origin format | ||||
| GE_IF_BOOL_EXEC(op_desc->MutableInputDesc(i) == nullptr, continue); | |||||
| auto input_format = op_desc->MutableInputDesc(i)->GetFormat(); | auto input_format = op_desc->MutableInputDesc(i)->GetFormat(); | ||||
| // Pre-save data node (only main graph data) and default infer fail | // Pre-save data node (only main graph data) and default infer fail | ||||
| if (node_ptr->GetType() == DATA) { | if (node_ptr->GetType() == DATA) { | ||||
| @@ -164,6 +165,7 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std | |||||
| // Get all output desc format | // Get all output desc format | ||||
| auto output_size = static_cast<uint32_t>(op_desc->GetOutputsSize()); | auto output_size = static_cast<uint32_t>(op_desc->GetOutputsSize()); | ||||
| for (uint32_t i = 0; i < output_size; i++) { | for (uint32_t i = 0; i < output_size; i++) { | ||||
| GE_IF_BOOL_EXEC(op_desc->MutableOutputDesc(i) == nullptr, continue); | |||||
| auto output_format = op_desc->MutableOutputDesc(i)->GetFormat(); | auto output_format = op_desc->MutableOutputDesc(i)->GetFormat(); | ||||
| if (output_format != FORMAT_ND && output_format != FORMAT_RESERVED) { | if (output_format != FORMAT_ND && output_format != FORMAT_RESERVED) { | ||||
| node_is_all_nd = true; | node_is_all_nd = true; | ||||
| @@ -222,8 +224,9 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge:: | |||||
| for (const auto &in_anchor : node->GetAllInDataAnchors()) { | for (const auto &in_anchor : node->GetAllInDataAnchors()) { | ||||
| GELOGD("Node is [%s] [B]", (node->GetName()).c_str()); | GELOGD("Node is [%s] [B]", (node->GetName()).c_str()); | ||||
| auto in_data_anchor_idx = in_anchor->GetIdx(); | auto in_data_anchor_idx = in_anchor->GetIdx(); | ||||
| auto to_be_set_format = | |||||
| node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx))->GetOriginFormat(); | |||||
| auto input_desc = node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx)); | |||||
| GE_IF_BOOL_EXEC(input_desc == nullptr, continue); | |||||
| auto to_be_set_format = input_desc->GetOriginFormat(); | |||||
| if (to_be_set_format == FORMAT_ND) { | if (to_be_set_format == FORMAT_ND) { | ||||
| GELOGD("Node [%s] [B], format is ND", (node->GetName()).c_str()); | GELOGD("Node [%s] [B], format is ND", (node->GetName()).c_str()); | ||||
| continue; | continue; | ||||
| @@ -123,6 +123,7 @@ const std::string ATTR_NAME_AIPP_OUTPUTS = "_aipp_outputs"; | |||||
| const std::string ATTR_NAME_INPUT_DIMS = "input_dims"; | const std::string ATTR_NAME_INPUT_DIMS = "input_dims"; | ||||
| const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED = "_graph_has_been_added"; | |||||
| const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id"; | const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id"; | ||||
| const std::string ATTR_NAME_PARENT_GRAPH_NAME = "_parent_graph_name"; | const std::string ATTR_NAME_PARENT_GRAPH_NAME = "_parent_graph_name"; | ||||
| @@ -68,7 +68,7 @@ graphStatus Node::Init() { | |||||
| return GRAPH_SUCCESS; | return GRAPH_SUCCESS; | ||||
| } | } | ||||
| GE_CHK_BOOL_EXEC(op_ != nullptr, return GRAPH_FAILED, "original OpDesc is nullptr"); | GE_CHK_BOOL_EXEC(op_ != nullptr, return GRAPH_FAILED, "original OpDesc is nullptr"); | ||||
| size_t size = op_->GetInputsSize(); | |||||
| size_t size = op_->GetAllInputsSize(); | |||||
| for (size_t i = 0; i < size; i++) { | for (size_t i = 0; i < size; i++) { | ||||
| std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), i); | std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), i); | ||||
| if (anchor == nullptr) { | if (anchor == nullptr) { | ||||
| @@ -305,13 +305,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con | |||||
| GELOGE(GRAPH_FAILED, "add input desc failed."); | GELOGE(GRAPH_FAILED, "add input desc failed."); | ||||
| return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
| } | } | ||||
| std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size()); | |||||
| if (anchor == nullptr) { | |||||
| GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size()); | |||||
| return GRAPH_FAILED; | |||||
| if (index < GetAllInDataAnchors().size()) { | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]); | |||||
| } else { | |||||
| std::shared_ptr<InDataAnchor> anchor = | |||||
| ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size()); | |||||
| if (anchor == nullptr) { | |||||
| GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size()); | |||||
| return GRAPH_FAILED; | |||||
| } | |||||
| in_data_anchors_.push_back(anchor); | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_.back()); | |||||
| } | } | ||||
| in_data_anchors_.push_back(anchor); | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_.back()); | |||||
| return GRAPH_SUCCESS; | return GRAPH_SUCCESS; | ||||
| } | } | ||||
| @@ -347,20 +353,30 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con | |||||
| } | } | ||||
| GE_CHECK_NOTNULL(op_); | GE_CHECK_NOTNULL(op_); | ||||
| auto op_desc = input_node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(op_desc); | |||||
| if (op_->AddInputDesc(name, op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) { | |||||
| GELOGE(GRAPH_FAILED, "add input desc failed."); | |||||
| return GRAPH_FAILED; | |||||
| auto input_op_desc = input_node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(input_op_desc); | |||||
| auto index = op_->GetInputIndexByName(name); | |||||
| if (index != -1) { | |||||
| if (index >= static_cast<int>(in_data_anchors_.size())) { | |||||
| GELOGE(GRAPH_FAILED, "op %s get input name %s 's index %d is illegal.", op_->GetName().c_str(), name.c_str(), | |||||
| index); | |||||
| return GRAPH_FAILED; | |||||
| } | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]); | |||||
| } else { | |||||
| std::shared_ptr<InDataAnchor> anchor = | |||||
| ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size()); | |||||
| if (anchor == nullptr) { | |||||
| GELOGE(GRAPH_FAILED, "in_data_anchors_size is:%zu, malloc shared_ptr failed.", in_data_anchors_.size()); | |||||
| return GRAPH_FAILED; | |||||
| } | |||||
| in_data_anchors_.push_back(anchor); | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_.back()); | |||||
| } | } | ||||
| std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size()); | |||||
| if (anchor == nullptr) { | |||||
| GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size()); | |||||
| if (op_->AddInputDesc(name, input_op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) { | |||||
| GELOGE(GRAPH_FAILED, "add input desc failed."); | |||||
| return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
| } | } | ||||
| in_data_anchors_.push_back(anchor); | |||||
| (void)out_anchors.at(0)->LinkTo(in_data_anchors_.back()); | |||||
| return GRAPH_SUCCESS; | return GRAPH_SUCCESS; | ||||
| } | } | ||||
| @@ -675,7 +675,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| if (inputs_desc_[index]->IsValid() != GRAPH_SUCCESS) { | if (inputs_desc_[index]->IsValid() != GRAPH_SUCCESS) { | ||||
| GELOGE(GRAPH_FAILED, "inputsDesc[%u] is InValid", index); | |||||
| GELOGW("inputsDesc[%u] is InValid", index); | |||||
| return nullptr; | return nullptr; | ||||
| } else { | } else { | ||||
| return inputs_desc_[static_cast<size_t>(index)]; | return inputs_desc_[static_cast<size_t>(index)]; | ||||
| @@ -1504,7 +1504,9 @@ class GraphBuilderImpl { | |||||
| GE_CHK_BOOL_EXEC(dst_anchor != nullptr, return GRAPH_FAILED, "GetInDataAnchor failed."); | GE_CHK_BOOL_EXEC(dst_anchor != nullptr, return GRAPH_FAILED, "GetInDataAnchor failed."); | ||||
| auto ret = GraphUtils::AddEdge(src_anchor, dst_anchor); | auto ret = GraphUtils::AddEdge(src_anchor, dst_anchor); | ||||
| GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED, "AddEdge failed."); | |||||
| GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED, | |||||
| "from node[%s][%d] to node[%s][%d]AddEdge failed.", src_node_ptr->GetName().c_str(), | |||||
| src_anchor->GetIdx(), dst_node_info->second->GetName().c_str(), dst_anchor->GetIdx()); | |||||
| } | } | ||||
| } | } | ||||
| auto out_control_anchor = src_node_ptr->GetOutControlAnchor(); | auto out_control_anchor = src_node_ptr->GetOutControlAnchor(); | ||||
| @@ -1536,19 +1538,23 @@ inline bool HasSameNameNode(const ComputeGraphPtr &compute_graph) { | |||||
| for (const auto &graph : compute_graph->GetAllSubgraphs()) { | for (const auto &graph : compute_graph->GetAllSubgraphs()) { | ||||
| std::set<string> node_names; | std::set<string> node_names; | ||||
| for (auto const &node : graph->GetDirectNode()) { | for (auto const &node : graph->GetDirectNode()) { | ||||
| node_names.insert(node->GetName()); | |||||
| } | |||||
| if (node_names.size() != graph->GetDirectNodesSize()) { | |||||
| return true; | |||||
| auto result = node_names.insert(node->GetName()); | |||||
| if (!result.second) { | |||||
| GELOGE(GRAPH_FAILED, "graph %s has same name node%s", graph->GetName().c_str(), node->GetName().c_str()); | |||||
| return true; | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| std::set<string> node_names; | std::set<string> node_names; | ||||
| for (auto const &node : compute_graph->GetDirectNode()) { | for (auto const &node : compute_graph->GetDirectNode()) { | ||||
| node_names.insert(node->GetName()); | |||||
| auto result = node_names.insert(node->GetName()); | |||||
| if (!result.second) { | |||||
| GELOGE(GRAPH_FAILED, "graph %s has same name node%s", compute_graph->GetName().c_str(), node->GetName().c_str()); | |||||
| return true; | |||||
| } | |||||
| } | } | ||||
| return node_names.size() != compute_graph->GetDirectNodesSize(); | |||||
| return false; | |||||
| } | } | ||||
| ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector<ge::Operator> &inputs) { | ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector<ge::Operator> &inputs) { | ||||
| @@ -51,6 +51,9 @@ graphStatus ReverseBrushWhileBodySubGraph(const ConstNodePtr &node) { | |||||
| for (const auto &node_sub : sub_graph_body->GetAllNodes()) { | for (const auto &node_sub : sub_graph_body->GetAllNodes()) { | ||||
| for (size_t i = 0; i < node_sub->GetAllInDataAnchorsSize(); i++) { | for (size_t i = 0; i < node_sub->GetAllInDataAnchorsSize(); i++) { | ||||
| auto input_desc = node_sub->GetOpDesc()->MutableInputDesc(i); | auto input_desc = node_sub->GetOpDesc()->MutableInputDesc(i); | ||||
| GE_IF_BOOL_EXEC(input_desc == nullptr, | |||||
| GELOGW("Get null input by index %zu from node %s ", i, node_sub->GetName().c_str()); | |||||
| continue); | |||||
| (void)input_desc->SetUnknownDimNumShape(); | (void)input_desc->SetUnknownDimNumShape(); | ||||
| } | } | ||||
| for (size_t i = 0; i < node_sub->GetAllOutDataAnchorsSize(); i++) { | for (size_t i = 0; i < node_sub->GetAllOutDataAnchorsSize(); i++) { | ||||
| @@ -376,10 +379,13 @@ graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) { | |||||
| continue; | continue; | ||||
| } | } | ||||
| int peer_out_idx = peer_out_data_anchor->GetIdx(); | int peer_out_idx = peer_out_data_anchor->GetIdx(); | ||||
| auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx)); | |||||
| auto peer_out_desc = peer_out_data_node->GetOpDesc()->MutableOutputDesc(static_cast<uint32_t>(peer_out_idx)); | auto peer_out_desc = peer_out_data_node->GetOpDesc()->MutableOutputDesc(static_cast<uint32_t>(peer_out_idx)); | ||||
| // check shape and dtype continuity. do not stop process | // check shape and dtype continuity. do not stop process | ||||
| auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx)); | |||||
| if (in_desc == nullptr) { | |||||
| continue; | |||||
| } | |||||
| auto in_shape = in_desc->GetShape().GetDims(); | auto in_shape = in_desc->GetShape().GetDims(); | ||||
| auto in_dtype = in_desc->GetDataType(); | auto in_dtype = in_desc->GetDataType(); | ||||
| auto peer_out_shape = peer_out_desc->GetShape().GetDims(); | auto peer_out_shape = peer_out_desc->GetShape().GetDims(); | ||||
| @@ -264,11 +264,11 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const | |||||
| return; | return; | ||||
| } | } | ||||
| // Input describes | // Input describes | ||||
| auto size_in = op_desc->GetInputsSize(); | |||||
| auto size_in = op_desc->GetAllInputsSize(); | |||||
| AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "input_desc_nums", &size_in); | AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "input_desc_nums", &size_in); | ||||
| if (size_in > 0) { | if (size_in > 0) { | ||||
| for (uint32_t i = 0; i < size_in; i++) { | for (uint32_t i = 0; i < size_in; i++) { | ||||
| auto input_desc = op_desc->GetInputDescPtr(i); | |||||
| auto input_desc = op_desc->GetInputDescPtrDfault(i); | |||||
| if (input_desc != nullptr) { | if (input_desc != nullptr) { | ||||
| auto data_type = TypeUtils::DataTypeToSerialString(input_desc->GetDataType()); | auto data_type = TypeUtils::DataTypeToSerialString(input_desc->GetDataType()); | ||||
| AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "input_desc_dtype:" + std::to_string(i), | AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "input_desc_dtype:" + std::to_string(i), | ||||
| @@ -480,9 +480,20 @@ void OnnxUtils::AddAttrProtoFromNodeMembers(const NodePtr &node, onnx::NodeProto | |||||
| if (!recv_list.empty()) { | if (!recv_list.empty()) { | ||||
| AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "recv_event_id_list", &recv_list); | AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "recv_event_id_list", &recv_list); | ||||
| } | } | ||||
| // 2.Attributes added from node's op_(message OpDef) | |||||
| auto op_desc = node->op_; | auto op_desc = node->op_; | ||||
| if (op_desc != nullptr) { | if (op_desc != nullptr) { | ||||
| // for input_name_idx_ in opdesc | |||||
| auto input_name_2_indexs = op_desc->GetAllInputName(); | |||||
| ::google::protobuf::RepeatedPtrField<::std::string> input_names; | |||||
| ::google::protobuf::RepeatedField<::google::protobuf::int64> input_indexes; | |||||
| for (const auto &input_name_2_index : input_name_2_indexs) { | |||||
| std::string input_name = input_name_2_index.first; | |||||
| input_names.Add(std::move(input_name)); | |||||
| input_indexes.Add(input_name_2_index.second); | |||||
| } | |||||
| AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRINGS, "_input_name_key", input_names); | |||||
| AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "_input_name_value", input_indexes); | |||||
| // 2.Attributes added from node's op_(message OpDef) | |||||
| // Input and out describes | // Input and out describes | ||||
| AddAttrProtoForOpInAndOutDesc(node_proto, op_desc); | AddAttrProtoForOpInAndOutDesc(node_proto, op_desc); | ||||
| // Others | // Others | ||||
| @@ -1470,8 +1470,7 @@ graphStatus GraphUtils::CopyTensorAttrs(const OpDescPtr &dst_desc, const NodePtr | |||||
| for (uint32_t i = 0; i < src_node->GetAllInDataAnchorsSize(); ++i) { | for (uint32_t i = 0; i < src_node->GetAllInDataAnchorsSize(); ++i) { | ||||
| auto input_desc = dst_desc->MutableInputDesc(i); | auto input_desc = dst_desc->MutableInputDesc(i); | ||||
| if (input_desc == nullptr) { | if (input_desc == nullptr) { | ||||
| GELOGE(GRAPH_FAILED, "Param dst node not valid"); | |||||
| return GRAPH_FAILED; | |||||
| continue; | |||||
| } | } | ||||
| input_desc->CopyAttrsFrom(src_desc->GetInputDesc(i)); | input_desc->CopyAttrsFrom(src_desc->GetInputDesc(i)); | ||||
| } | } | ||||
| @@ -513,7 +513,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<GeTensorPtr> OpDescUtils:: | |||||
| } | } | ||||
| return MutableWeights(*node); | return MutableWeights(*node); | ||||
| } | } | ||||
| GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus | GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus | ||||
| OpDescUtils::SetWeights(ge::Node &node, const vector<ge::GeTensorPtr> &weights) { | OpDescUtils::SetWeights(ge::Node &node, const vector<ge::GeTensorPtr> &weights) { | ||||
| GE_CHK_BOOL_EXEC(node.GetOpDesc() != nullptr, return GRAPH_PARAM_INVALID, "node.GetOpDesc is nullptr!"); | GE_CHK_BOOL_EXEC(node.GetOpDesc() != nullptr, return GRAPH_PARAM_INVALID, "node.GetOpDesc is nullptr!"); | ||||
| @@ -58,6 +58,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge) | |||||
| # need to remove dependencies on pb files later | # need to remove dependencies on pb files later | ||||
| file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | ||||
| "analyzer/analyzer.cc" | "analyzer/analyzer.cc" | ||||
| "client/ge_prof.cc" | |||||
| "client/ge_api.cc" | "client/ge_api.cc" | ||||
| "common/dump/dump_manager.cc" | "common/dump/dump_manager.cc" | ||||
| "common/dump/dump_properties.cc" | "common/dump/dump_properties.cc" | ||||
| @@ -225,6 +226,7 @@ target_link_libraries(ge_runner | |||||
| ${msprof} | ${msprof} | ||||
| ${runtime} | ${runtime} | ||||
| ${resouce} | ${resouce} | ||||
| ${ascend_hal} | |||||
| rt | rt | ||||
| dl) | dl) | ||||
| @@ -335,6 +337,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||||
| "host_kernels/unpack_kernel.cc" | "host_kernels/unpack_kernel.cc" | ||||
| "host_kernels/unsqueeze_kernel.cc" | "host_kernels/unsqueeze_kernel.cc" | ||||
| "hybrid/hybrid_davinci_model_stub.cc" | "hybrid/hybrid_davinci_model_stub.cc" | ||||
| "hybrid/node_executor/aicpu/aicpu_ext_info.cc" | |||||
| "init/gelib.cc" | "init/gelib.cc" | ||||
| "ir_build/atc_ir_common.cc" | "ir_build/atc_ir_common.cc" | ||||
| "ir_build/ge_ir_build.cc" | "ir_build/ge_ir_build.cc" | ||||
| @@ -101,7 +101,7 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { | |||||
| ge::Status Analyzer::Initialize() { | ge::Status Analyzer::Initialize() { | ||||
| ClearHistoryFile(); | ClearHistoryFile(); | ||||
| return CreateAnalyzerFile(); | |||||
| return SUCCESS; | |||||
| } | } | ||||
| void Analyzer::Finalize() { | void Analyzer::Finalize() { | ||||
| @@ -136,7 +136,7 @@ void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) { | |||||
| } else { | } else { | ||||
| auto iter1 = (iter->second).find(graph_id); | auto iter1 = (iter->second).find(graph_id); | ||||
| if (iter1 == (iter->second).end()) { | if (iter1 == (iter->second).end()) { | ||||
| GELOGW("can not find the graph json object by session_id[%lu] and graph_id[%lu].Do nothing", session_id, | |||||
| GELOGW("Can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing.", session_id, | |||||
| graph_id); | graph_id); | ||||
| } | } | ||||
| (iter->second).erase(iter1); | (iter->second).erase(iter1); | ||||
| @@ -169,6 +169,10 @@ void Analyzer::ClearHistoryFile() { | |||||
| } | } | ||||
| ge::Status Analyzer::CreateAnalyzerFile() { | ge::Status Analyzer::CreateAnalyzerFile() { | ||||
| if (is_json_file_create_) { | |||||
| GELOGD("analyzer file has been created!No necessary to create again!"); | |||||
| return SUCCESS; | |||||
| } | |||||
| GELOGD("start to create analyzer file!"); | GELOGD("start to create analyzer file!"); | ||||
| // Check whether the manifest exists, if not, create it. | // Check whether the manifest exists, if not, create it. | ||||
| string real_path = RealPath(kFilePath.c_str()); | string real_path = RealPath(kFilePath.c_str()); | ||||
| @@ -176,18 +180,19 @@ ge::Status Analyzer::CreateAnalyzerFile() { | |||||
| GELOGE(FAILED, "File path is invalid."); | GELOGE(FAILED, "File path is invalid."); | ||||
| return FAILED; | return FAILED; | ||||
| } | } | ||||
| string file = real_path + "/" + kAnalyzeFile; | |||||
| GELOGD("Created analyzer file:[%s]", file.c_str()); | |||||
| int fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority); | |||||
| std::lock_guard<std::mutex> lg(file_mutex_); | |||||
| json_file_name_ = real_path + "/" + kAnalyzeFile; | |||||
| GELOGD("Created analyzer file:[%s]", json_file_name_.c_str()); | |||||
| int fd = open(json_file_name_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority); | |||||
| if (fd < 0) { | if (fd < 0) { | ||||
| GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", file.c_str()); | |||||
| GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", json_file_name_.c_str()); | |||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| } | } | ||||
| if (close(fd) != 0) { | if (close(fd) != 0) { | ||||
| GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", file.c_str()); | |||||
| GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", json_file_name_.c_str()); | |||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| } | } | ||||
| json_file_name_ = file; | |||||
| is_json_file_create_ = true; | |||||
| GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str()); | GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str()); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -231,6 +236,12 @@ ge::Status Analyzer::DoAnalyze(DataInfo &data_info) { | |||||
| GELOGE(status, "save op info failed!"); | GELOGE(status, "save op info failed!"); | ||||
| return FAILED; | return FAILED; | ||||
| } | } | ||||
| // create json file | |||||
| status = CreateAnalyzerFile(); | |||||
| if (status != SUCCESS) { | |||||
| GELOGE(status, "create analyzer file failed!"); | |||||
| return status; | |||||
| } | |||||
| // save data to file | // save data to file | ||||
| return SaveAnalyzerDataToFile(); | return SaveAnalyzerDataToFile(); | ||||
| } | } | ||||
| @@ -24,6 +24,7 @@ | |||||
| #include <mutex> | #include <mutex> | ||||
| #include <memory> | #include <memory> | ||||
| #include <fstream> | #include <fstream> | ||||
| #include <atomic> | |||||
| #include "external/ge/ge_api_types.h" | #include "external/ge/ge_api_types.h" | ||||
| #include "graph/compute_graph.h" | #include "graph/compute_graph.h" | ||||
| @@ -181,6 +182,7 @@ class Analyzer { | |||||
| std::mutex file_mutex_; // protect json_file_ | std::mutex file_mutex_; // protect json_file_ | ||||
| std::ofstream json_file_; | std::ofstream json_file_; | ||||
| std::string json_file_name_; | std::string json_file_name_; | ||||
| std::atomic_bool is_json_file_create_{false}; | |||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| #endif // DOMI_ANALYZER_ANANLYZER_H_ | #endif // DOMI_ANALYZER_ANANLYZER_H_ | ||||
| @@ -29,6 +29,7 @@ file(GLOB PROTO_HEADER_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||||
| file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | ||||
| "ge_api.cc" | "ge_api.cc" | ||||
| "ge_prof.cc" | |||||
| ) | ) | ||||
| ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) | ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) | ||||
| @@ -66,5 +67,6 @@ target_link_libraries(ge_client | |||||
| ${slog} | ${slog} | ||||
| ${mmpa} | ${mmpa} | ||||
| ${runtime} | ${runtime} | ||||
| ${msprof} | |||||
| rt | rt | ||||
| dl) | dl) | ||||
| @@ -39,7 +39,7 @@ using std::vector; | |||||
| namespace { | namespace { | ||||
| const int32_t kMaxStrLen = 128; | const int32_t kMaxStrLen = 128; | ||||
| } | |||||
| } // namespace | |||||
| static bool g_ge_initialized = false; | static bool g_ge_initialized = false; | ||||
| static std::mutex g_ge_release_mutex; // GEFinalize and ~Session use | static std::mutex g_ge_release_mutex; // GEFinalize and ~Session use | ||||
| @@ -0,0 +1,375 @@ | |||||
| /** | |||||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "ge/ge_prof.h" | |||||
| #include "ge/ge_api.h" | |||||
| #include "init/gelib.h" | |||||
| #include "common/debug/log.h" | |||||
| #include "framework/common/debug/ge_log.h" | |||||
| #include "common/profiling/profiling_manager.h" | |||||
| #include "graph/load/graph_loader.h" | |||||
| #include "toolchain/prof_acl_api.h" | |||||
| using std::map; | |||||
| using std::string; | |||||
| using std::vector; | |||||
namespace {
// Upper bound on how many devices one profiling config may reference.
const uint32_t kMaxDeviceNum = 64;
// Command identifiers dispatched through GraphLoader::CommandHandle().
const std::string PROFILING_INIT = "prof_init";
const std::string PROFILING_FINALIZE = "prof_finalize";
const std::string PROFILING_START = "prof_start";
const std::string PROFILING_STOP = "prof_stop";
// Keys of the key/value command-parameter list built by TransProfConfigToParam().
const std::string DEVICES_NUMS = "devNums";
const std::string DEVICE_ID_LIST = "devIdList";
const std::string AICORE_METRICS = "aicoreMetrics";
// Maps the public aicore-metrics enum to the string form the profiling command
// handler expects. NOTE: the spelling "ARITHMATIC" matches the external API
// contract and must not be "corrected" here.
const std::map<ge::ProfilingAicoreMetrics, std::string> kProfAicoreMetricsToString = {
    {ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
    {ge::kAicorePipeline, "AICORE_PIPELINE"},
    {ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
    {ge::kAicoreMemory, "AICORE_MEMORY"},
    {ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
    {ge::kAicoreStall, "AICORE_STALL"},
    {ge::kAicoreMetricsAll, "AICORE_METRICS_ALL"}};
// Translates the public ge::kProf* data-type bits into the PROF_* bits consumed
// by the underlying profiling ACL API (see aclgrphProfCreateConfig).
const std::map<uint64_t, uint64_t> kDataTypeConfigMapping = {{ge::kProfAcl, PROF_ACL_API},
                                                             {ge::kProfTaskTime, PROF_TASK_TIME},
                                                             {ge::kProfAiCoreMetrics, PROF_AICORE_METRICS},
                                                             {ge::kProfAicpuTrace, PROF_AICPU_TRACE},
                                                             {ge::kProfModelExecute, PROF_MODEL_EXECUTE},
                                                             {ge::kProfRuntimeApi, PROF_RUNTIME_API},
                                                             {ge::kProfRuntimeTrace, PROF_RUNTIME_TRACE},
                                                             {ge::kProfScheduleTimeline, PROF_SCHEDULE_TIMELINE},
                                                             {ge::kProfScheduleTrace, PROF_SCHEDULE_TRACE},
                                                             {ge::kProfAiVectorCoreMetrics, PROF_AIVECTORCORE_METRICS},
                                                             {ge::kProfSubtaskTime, PROF_SUBTASK_TIME},
                                                             {ge::kProfTrainingTrace, PROF_TRAINING_TRACE},
                                                             {ge::kProfHcclTrace, PROF_HCCL_TRACE},
                                                             {ge::kProfDataProcess, PROF_DATA_PROCESS},
                                                             {ge::kProfTaskTrace, PROF_TASK_TRACE},
                                                             {ge::kProfModelLoad, PROF_MODEL_LOAD}};
}  // namespace
// Tracks whether graph-mode profiling has been initialized via aclgrphProfInit.
// Read and written only while holding g_prof_mutex_.
static bool g_graph_prof_init_ = false;
// Serializes all aclgrphProf* entry points against each other.
static std::mutex g_prof_mutex_;
| namespace ge { | |||||
// Opaque handle returned to callers by aclgrphProfCreateConfig; simply wraps
// the runtime-facing ProfConfig so the public header need not expose it.
struct aclgrphProfConfig {
  ProfConfig config;
};
| Status aclgrphProfInit(const char *profiler_path, uint32_t length) { | |||||
| GELOGT(TRACE_INIT, "Graph prof init start"); | |||||
| std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); | |||||
| if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { | |||||
| GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); | |||||
| return FAILED; | |||||
| } | |||||
| std::lock_guard<std::mutex> lock(g_prof_mutex_); | |||||
| if (g_graph_prof_init_) { | |||||
| GELOGW("Multi graph profiling initializations."); | |||||
| return GE_PROF_MULTI_INIT; | |||||
| } | |||||
| Status ret = CheckPath(profiler_path, length); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Profiling config path is invalid."); | |||||
| return ret; | |||||
| } | |||||
| // if command mode is set, just return | |||||
| if (ProfilingManager::Instance().ProfilingOn()) { | |||||
| GELOGW("Graph prof init failed, cause profiling command pattern is running."); | |||||
| return GE_PROF_MODE_CONFLICT; | |||||
| } | |||||
| ret = ProfInit(profiler_path); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "ProfInit init fail"); | |||||
| return ret; | |||||
| } | |||||
| GraphLoader graph_loader; | |||||
| Command command; | |||||
| command.cmd_params.clear(); | |||||
| command.cmd_type = PROFILING_INIT; | |||||
| command.module_index = kProfModelLoad | kProfTrainingTrace; | |||||
| ret = graph_loader.CommandHandle(command); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Handle profiling command %s failed, config = %s", PROFILING_INIT.c_str(), profiler_path); | |||||
| return ret; | |||||
| } | |||||
| if (!g_graph_prof_init_) { | |||||
| g_graph_prof_init_ = true; | |||||
| GELOGI("Profiling init successfully."); | |||||
| } | |||||
| GELOGI("Successfully execute GraphProfInit."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status aclgrphProfFinalize() { | |||||
| std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); | |||||
| if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { | |||||
| GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); | |||||
| return FAILED; | |||||
| } | |||||
| std::lock_guard<std::mutex> lock(g_prof_mutex_); | |||||
| // if command mode is set, just return | |||||
| if (ProfilingManager::Instance().ProfilingOn()) { | |||||
| GELOGW("Graph prof finalize failed, cause profiling command pattern is running."); | |||||
| return GE_PROF_MODE_CONFLICT; | |||||
| } | |||||
| if (!g_graph_prof_init_) { | |||||
| GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); | |||||
| return GE_PROF_NOT_INIT; | |||||
| } | |||||
| GraphLoader graph_loader; | |||||
| Command command; | |||||
| command.cmd_params.clear(); | |||||
| command.cmd_type = PROFILING_FINALIZE; | |||||
| Status ret = graph_loader.CommandHandle(command); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Handle profiling command %s failed.", PROFILING_FINALIZE.c_str()); | |||||
| return ret; | |||||
| } | |||||
| ret = ProfFinalize(); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Finalize profiling failed, result = %d", ret); | |||||
| } | |||||
| if (ret == SUCCESS) { | |||||
| g_graph_prof_init_ = false; | |||||
| GELOGI("Successfully execute GraphProfFinalize."); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| bool TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector<string> &prof_config_params) { | |||||
| prof_config_params.clear(); | |||||
| prof_config_params.emplace_back(DEVICES_NUMS); | |||||
| prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums)); | |||||
| prof_config_params.emplace_back(DEVICE_ID_LIST); | |||||
| std::string devID = ""; | |||||
| if (profiler_config->config.devNums == 0) { | |||||
| GELOGW("The device num is invalid."); | |||||
| return false; | |||||
| } | |||||
| for (uint32_t i = 0; i < profiler_config->config.devNums; i++) { | |||||
| devID.append(std::to_string(profiler_config->config.devIdList[i])); | |||||
| if (i != profiler_config->config.devNums - 1) { | |||||
| devID.append(","); | |||||
| } | |||||
| } | |||||
| prof_config_params.push_back(devID); | |||||
| prof_config_params.push_back(AICORE_METRICS); | |||||
| auto iter = | |||||
| kProfAicoreMetricsToString.find(static_cast<ProfilingAicoreMetrics>(profiler_config->config.aicoreMetrics)); | |||||
| if (iter == kProfAicoreMetricsToString.end()) { | |||||
| GELOGW("The prof aicore metrics is invalid."); | |||||
| return false; | |||||
| } | |||||
| prof_config_params.push_back(iter->second); | |||||
| return true; | |||||
| } | |||||
| bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { | |||||
| if (deviceid_list == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "deviceIdList is nullptr"); | |||||
| return false; | |||||
| } | |||||
| if (device_nums == 0 || device_nums > kMaxDeviceNum) { | |||||
| GELOGE(PARAM_INVALID, "The device nums is invalid."); | |||||
| return false; | |||||
| } | |||||
| // real device num | |||||
| int32_t dev_count = 0; | |||||
| rtError_t rt_err = rtGetDeviceCount(&dev_count); | |||||
| if (rt_err != RT_ERROR_NONE) { | |||||
| GELOGE(INTERNAL_ERROR, "Get the Device count fail."); | |||||
| return false; | |||||
| } | |||||
| if (device_nums > static_cast<uint32_t>(dev_count)) { | |||||
| GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count); | |||||
| return false; | |||||
| } | |||||
| std::unordered_set<uint32_t> record; | |||||
| for (size_t i = 0; i < device_nums; ++i) { | |||||
| uint32_t dev_id = deviceid_list[i]; | |||||
| if (dev_id >= static_cast<uint32_t>(dev_count)) { | |||||
| GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count); | |||||
| return false; | |||||
| } | |||||
| if (record.count(dev_id) > 0) { | |||||
| GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id); | |||||
| return false; | |||||
| } | |||||
| record.insert(dev_id); | |||||
| } | |||||
| return true; | |||||
| } | |||||
| aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums, | |||||
| ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events, | |||||
| uint64_t data_type_config) { | |||||
| if (!isProfConfigValid(deviceid_list, device_nums)) { | |||||
| return nullptr; | |||||
| } | |||||
| aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig(); | |||||
| if (config == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail"); | |||||
| return nullptr; | |||||
| } | |||||
| config->config.devNums = device_nums; | |||||
| if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list, | |||||
| device_nums * sizeof(uint32_t)) != EOK) { | |||||
| GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums); | |||||
| delete config; | |||||
| return nullptr; | |||||
| } | |||||
| config->config.aicoreMetrics = static_cast<ProfAicoreMetrics>(aicore_metrics); | |||||
| uint64_t data_type = 0; | |||||
| for (auto &iter : kDataTypeConfigMapping) { | |||||
| if ((iter.first & data_type_config) == iter.first) { | |||||
| data_type |= iter.second; | |||||
| } | |||||
| } | |||||
| config->config.dataTypeConfig = data_type; | |||||
| GELOGI("Successfully create prof config."); | |||||
| return config; | |||||
| } | |||||
| Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) { | |||||
| if (profiler_config == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr"); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| delete profiler_config; | |||||
| GELOGI("Successfully destroy prof config."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status aclgrphProfStart(aclgrphProfConfig *profiler_config) { | |||||
| if (profiler_config == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid."); | |||||
| return FAILED; | |||||
| } | |||||
| std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); | |||||
| if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { | |||||
| GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); | |||||
| return FAILED; | |||||
| } | |||||
| std::lock_guard<std::mutex> lock(g_prof_mutex_); | |||||
| // if command mode is set, just return | |||||
| if (ProfilingManager::Instance().ProfilingOn()) { | |||||
| GELOGW("Graph prof finalize failed, cause profiling command pattern is running."); | |||||
| return GE_PROF_MODE_CONFLICT; | |||||
| } | |||||
| if (!g_graph_prof_init_) { | |||||
| GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); | |||||
| return GE_PROF_NOT_INIT; | |||||
| } | |||||
| Status ret = ProfStartProfiling(&profiler_config->config); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Start profiling failed, prof result = %d", ret); | |||||
| return FAILED; | |||||
| } | |||||
| std::vector<string> prof_params; | |||||
| if (!TransProfConfigToParam(profiler_config, prof_params)) { | |||||
| GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed"); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| GraphLoader graph_loader; | |||||
| Command command; | |||||
| command.cmd_params.clear(); | |||||
| command.cmd_type = PROFILING_START; | |||||
| command.cmd_params = prof_params; | |||||
| command.module_index = profiler_config->config.dataTypeConfig; | |||||
| ret = graph_loader.CommandHandle(command); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Handle profiling command failed"); | |||||
| return FAILED; | |||||
| } | |||||
| GELOGI("Successfully execute GraphProfStartProfiling."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status aclgrphProfStop(aclgrphProfConfig *profiler_config) { | |||||
| if (profiler_config == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid."); | |||||
| return FAILED; | |||||
| } | |||||
| std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); | |||||
| if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { | |||||
| GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); | |||||
| return FAILED; | |||||
| } | |||||
| std::lock_guard<std::mutex> lock(g_prof_mutex_); | |||||
| // if command mode is set, just return | |||||
| if (ProfilingManager::Instance().ProfilingOn()) { | |||||
| GELOGW("Graph prof finalize failed, cause profiling command pattern is running."); | |||||
| return GE_PROF_MODE_CONFLICT; | |||||
| } | |||||
| if (!g_graph_prof_init_) { | |||||
| GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); | |||||
| return GE_PROF_NOT_INIT; | |||||
| } | |||||
| Status ret = ProfStopProfiling(&profiler_config->config); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Stop profiling failed, prof result = %d", ret); | |||||
| return ret; | |||||
| } | |||||
| std::vector<string> prof_params; | |||||
| if (!TransProfConfigToParam(profiler_config, prof_params)) { | |||||
| GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed"); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| GraphLoader graph_loader; | |||||
| Command command; | |||||
| command.cmd_params.clear(); | |||||
| command.cmd_type = PROFILING_STOP; | |||||
| command.cmd_params = prof_params; | |||||
| command.module_index = profiler_config->config.dataTypeConfig; | |||||
| ret = graph_loader.CommandHandle(command); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Handle profiling command failed"); | |||||
| return FAILED; | |||||
| } | |||||
| GELOGI("Successfully execute GraphProfStopProfiling."); | |||||
| return SUCCESS; | |||||
| } | |||||
| } // namespace ge | |||||
| @@ -4,6 +4,7 @@ LOCAL_PATH := $(call my-dir) | |||||
| COMMON_LOCAL_SRC_FILES := \ | COMMON_LOCAL_SRC_FILES := \ | ||||
| proto/ge_api.proto \ | proto/ge_api.proto \ | ||||
| ge_api.cc \ | ge_api.cc \ | ||||
| ge_prof.cc \ | |||||
| COMMON_LOCAL_C_INCLUDES := \ | COMMON_LOCAL_C_INCLUDES := \ | ||||
| @@ -69,6 +70,8 @@ LOCAL_SHARED_LIBRARIES := \ | |||||
| libregister \ | libregister \ | ||||
| libge_compiler \ | libge_compiler \ | ||||
| libge_common \ | libge_common \ | ||||
| libmsprof | |||||
| LOCAL_LDFLAGS := -lrt -ldl | LOCAL_LDFLAGS := -lrt -ldl | ||||
| @@ -102,6 +105,7 @@ LOCAL_SHARED_LIBRARIES := \ | |||||
| libruntime \ | libruntime \ | ||||
| libge_compiler \ | libge_compiler \ | ||||
| libge_common \ | libge_common \ | ||||
| libmsprof | |||||
| LOCAL_LDFLAGS := -lrt -ldl | LOCAL_LDFLAGS := -lrt -ldl | ||||
| @@ -27,6 +27,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||||
| "context/ctx.cc" | "context/ctx.cc" | ||||
| "cust_aicpu_kernel_store.cc" | "cust_aicpu_kernel_store.cc" | ||||
| "debug/memory_dumper.cc" | "debug/memory_dumper.cc" | ||||
| "dump/dump_properties.cc" | |||||
| "fmk_error_codes.cc" | "fmk_error_codes.cc" | ||||
| "formats/format_transfers/datatype_transfer.cc" | "formats/format_transfers/datatype_transfer.cc" | ||||
| "formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc" | "formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc" | ||||
| @@ -49,7 +49,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf | |||||
| dump_properties_.ClearDumpPropertyValue(); | dump_properties_.ClearDumpPropertyValue(); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| dump_properties_.SetDumpStatus(dump_status); | |||||
| dump_op_switch = dump_config.dump_op_switch; | dump_op_switch = dump_config.dump_op_switch; | ||||
| dump_properties_.SetDumpOpSwitch(dump_op_switch); | |||||
| if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) { | if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) { | ||||
| GELOGE(PARAM_INVALID, "Dump list is invalid,dump_op_switch is %s", dump_op_switch.c_str()); | GELOGE(PARAM_INVALID, "Dump list is invalid,dump_op_switch is %s", dump_op_switch.c_str()); | ||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| @@ -95,14 +98,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpManager::IsDumpOpen() { | |||||
| std::lock_guard<std::mutex> lock(mutex_); | |||||
| if (!dump_properties_.GetDumpPath().empty()) { | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties() { | FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties() { | ||||
| std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
| return dump_properties_; | return dump_properties_; | ||||
| @@ -28,7 +28,6 @@ class DumpManager { | |||||
| static DumpManager &GetInstance(); | static DumpManager &GetInstance(); | ||||
| Status SetDumpConf(const DumpConfig &dump_config); | Status SetDumpConf(const DumpConfig &dump_config); | ||||
| bool IsDumpOpen(); | |||||
| const DumpProperties &GetDumpProperties(); | const DumpProperties &GetDumpProperties(); | ||||
| void SetModelName(const std::string &model_name); | void SetModelName(const std::string &model_name); | ||||
| const std::string &GetModelName(); | const std::string &GetModelName(); | ||||
| @@ -16,7 +16,6 @@ | |||||
| #include "common/dump/dump_op.h" | #include "common/dump/dump_op.h" | ||||
| #include "aicpu/common/aicpu_task_struct.h" | |||||
| #include "common/dump/dump_manager.h" | #include "common/dump/dump_manager.h" | ||||
| #include "common/ge/datatype_util.h" | #include "common/ge/datatype_util.h" | ||||
| #include "framework/common/debug/ge_log.h" | #include "framework/common/debug/ge_log.h" | ||||
| @@ -28,6 +27,7 @@ | |||||
| #include "proto/ge_ir.pb.h" | #include "proto/ge_ir.pb.h" | ||||
| #include "proto/op_mapping_info.pb.h" | #include "proto/op_mapping_info.pb.h" | ||||
| #include "runtime/mem.h" | #include "runtime/mem.h" | ||||
| #include "aicpu/common/aicpu_task_struct.h" | |||||
| namespace { | namespace { | ||||
| const uint32_t kAicpuLoadFlag = 1; | const uint32_t kAicpuLoadFlag = 1; | ||||
| @@ -31,7 +31,7 @@ | |||||
| namespace { | namespace { | ||||
| const std::string kEnableFlag = "1"; | const std::string kEnableFlag = "1"; | ||||
| const std::string kDumpStatusOpen = "on"; | |||||
| const uint32_t kAicoreOverflow = (0x1 << 0); | const uint32_t kAicoreOverflow = (0x1 << 0); | ||||
| const uint32_t kAtomicOverflow = (0x1 << 1); | const uint32_t kAtomicOverflow = (0x1 << 1); | ||||
| const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow); | const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow); | ||||
| @@ -81,12 +81,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti | |||||
| if (enable_dump_ == kEnableFlag) { | if (enable_dump_ == kEnableFlag) { | ||||
| std::string dump_step; | std::string dump_step; | ||||
| if (GetContext().GetOption(OPTION_EXEC_DUMP_STEP, dump_step) == GRAPH_SUCCESS) { | if (GetContext().GetOption(OPTION_EXEC_DUMP_STEP, dump_step) == GRAPH_SUCCESS) { | ||||
| GELOGD("Get dump step %s successfully", dump_step.c_str()); | |||||
| GELOGI("Get dump step %s successfully", dump_step.c_str()); | |||||
| SetDumpStep(dump_step); | SetDumpStep(dump_step); | ||||
| } | } | ||||
| string dump_mode; | string dump_mode; | ||||
| if (GetContext().GetOption(OPTION_EXEC_DUMP_MODE, dump_mode) == GRAPH_SUCCESS) { | if (GetContext().GetOption(OPTION_EXEC_DUMP_MODE, dump_mode) == GRAPH_SUCCESS) { | ||||
| GELOGD("Get dump mode %s successfully", dump_mode.c_str()); | |||||
| GELOGI("Get dump mode %s successfully", dump_mode.c_str()); | |||||
| SetDumpMode(dump_mode); | SetDumpMode(dump_mode); | ||||
| } | } | ||||
| AddPropertyValue(DUMP_ALL_MODEL, {}); | AddPropertyValue(DUMP_ALL_MODEL, {}); | ||||
| @@ -192,6 +192,37 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti | |||||
| return dump_mode_; | return dump_mode_; | ||||
| } | } | ||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpStatus(const std::string &status) { | |||||
| dump_status_ = status; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpStatus() const { | |||||
| return dump_status_; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch( | |||||
| const std::string &dump_op_switch) { | |||||
| dump_op_switch_ = dump_op_switch; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpOpSwitch() const { | |||||
| return dump_op_switch_; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsSingleOpNeedDump() const { | |||||
| if (dump_op_switch_ == kDumpStatusOpen) { | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsDumpOpen() const { | |||||
| if (enable_dump_ == kEnableFlag || dump_status_ == kDumpStatusOpen) { | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| void DumpProperties::CopyFrom(const DumpProperties &other) { | void DumpProperties::CopyFrom(const DumpProperties &other) { | ||||
| if (&other != this) { | if (&other != this) { | ||||
| enable_dump_ = other.enable_dump_; | enable_dump_ = other.enable_dump_; | ||||
| @@ -61,10 +61,26 @@ class DumpProperties { | |||||
| const std::string &GetDumpMode() const; | const std::string &GetDumpMode() const; | ||||
| void SetDumpStatus(const std::string &status); | |||||
| const std::string &GetDumpStatus() const; | |||||
| void SetDumpOpSwitch(const std::string &dump_op_switch); | |||||
| const std::string &GetDumpOpSwitch() const; | |||||
| bool IsOpDebugOpen() const { return is_op_debug_; } | bool IsOpDebugOpen() const { return is_op_debug_; } | ||||
| bool IsDumpOpen() const; | |||||
| bool IsSingleOpNeedDump() const; | |||||
| uint32_t GetOpDebugMode() const { return op_debug_mode_; } | uint32_t GetOpDebugMode() const { return op_debug_mode_; } | ||||
| const std::string &GetEnableDump() const { return enable_dump_; } | |||||
| const std::string &GetEnableDumpDebug() const { return enable_dump_debug_; } | |||||
| private: | private: | ||||
| void CopyFrom(const DumpProperties &other); | void CopyFrom(const DumpProperties &other); | ||||
| @@ -76,6 +92,8 @@ class DumpProperties { | |||||
| std::string dump_path_; | std::string dump_path_; | ||||
| std::string dump_step_; | std::string dump_step_; | ||||
| std::string dump_mode_; | std::string dump_mode_; | ||||
| std::string dump_status_; | |||||
| std::string dump_op_switch_; | |||||
| std::map<std::string, std::set<std::string>> model_dump_properties_map_; | std::map<std::string, std::set<std::string>> model_dump_properties_map_; | ||||
| bool is_op_debug_ = false; | bool is_op_debug_ = false; | ||||
| @@ -15,14 +15,15 @@ | |||||
| */ | */ | ||||
| #include "common/ge/op_tiling_manager.h" | #include "common/ge/op_tiling_manager.h" | ||||
| #include "common/util/error_manager/error_manager.h" | |||||
| #include "framework/common/debug/log.h" | #include "framework/common/debug/log.h" | ||||
| #include <string> | #include <string> | ||||
| namespace { | namespace { | ||||
| const char *const kEnvName = "ASCEND_OPP_PATH"; | const char *const kEnvName = "ASCEND_OPP_PATH"; | ||||
| const std::string kDefaultPath = "/usr/local/Ascend/opp"; | const std::string kDefaultPath = "/usr/local/Ascend/opp"; | ||||
| const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/liboptiling.so"; | |||||
| const std::string kDefaultCustomTilingPath = "/op_impl/custom/liboptiling.so"; | |||||
| const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/ai_core/tbe/op_tiling/liboptiling.so"; | |||||
| const std::string kDefaultCustomTilingPath = "/op_impl/custom/ai_core/tbe/op_tiling/liboptiling.so"; | |||||
| const uint8_t kPrefixIndex = 9; | const uint8_t kPrefixIndex = 9; | ||||
| } // namespace | } // namespace | ||||
| @@ -44,7 +45,9 @@ std::string OpTilingManager::GetPath() { | |||||
| if (opp_path_env != nullptr) { | if (opp_path_env != nullptr) { | ||||
| char resolved_path[PATH_MAX]; | char resolved_path[PATH_MAX]; | ||||
| if (realpath(opp_path_env, resolved_path) == NULL) { | if (realpath(opp_path_env, resolved_path) == NULL) { | ||||
| GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'(%s) is invalid path.", opp_path_env); | |||||
| ErrorManager::GetInstance().ATCReportErrMessage("E19024", {"env", "value", "situation"}, | |||||
| {"ASCEND_OPP_PATH", opp_path_env, "loading the tiling lib"}); | |||||
| GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'[%s] is invalid path.", opp_path_env); | |||||
| return std::string(); | return std::string(); | ||||
| } | } | ||||
| opp_path = resolved_path; | opp_path = resolved_path; | ||||
| @@ -12,6 +12,7 @@ GE_COMMON_LOCAL_SRC_FILES := \ | |||||
| math/fp16_math.cc \ | math/fp16_math.cc \ | ||||
| debug/memory_dumper.cc \ | debug/memory_dumper.cc \ | ||||
| formats/utils/formats_trans_utils.cc \ | formats/utils/formats_trans_utils.cc \ | ||||
| dump/dump_properties.cc \ | |||||
| formats/format_transfers/datatype_transfer.cc \ | formats/format_transfers/datatype_transfer.cc \ | ||||
| formats/format_transfers/format_transfer_transpose.cc \ | formats/format_transfers/format_transfer_transpose.cc \ | ||||
| formats/format_transfers/format_transfer_nchw_nc1hwc0.cc \ | formats/format_transfers/format_transfer_nchw_nc1hwc0.cc \ | ||||
| @@ -497,7 +497,25 @@ Status ModelCacheHelper::LoadJsonFromFile(const string &file_name, Json &json) c | |||||
| GELOGW("Fail to open the file: %s.", path.c_str()); | GELOGW("Fail to open the file: %s.", path.c_str()); | ||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| } | } | ||||
| ifs >> json; | |||||
| try { | |||||
| ifs >> json; | |||||
| } catch (nlohmann::detail::parse_error e) { | |||||
| GELOGW("Fail to load json from file, json throw an error:%s.", e.what()); | |||||
| return INTERNAL_ERROR; | |||||
| } catch (nlohmann::detail::invalid_iterator e) { | |||||
| GELOGW("Fail to load json from file, json throw an error:%s.", e.what()); | |||||
| return INTERNAL_ERROR; | |||||
| } catch (nlohmann::detail::type_error e) { | |||||
| GELOGW("Fail to load json from file, json throw an error:%s.", e.what()); | |||||
| return INTERNAL_ERROR; | |||||
| } catch (nlohmann::detail::out_of_range e) { | |||||
| GELOGW("Fail to load json from file, json throw an error:%s.", e.what()); | |||||
| return INTERNAL_ERROR; | |||||
| } catch (nlohmann::detail::other_error e) { | |||||
| GELOGW("Fail to load json from file, json throw an error:%s.", e.what()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| if (!json.is_object()) { | if (!json.is_object()) { | ||||
| GELOGW("Fail to load the json file: %s.", path.c_str()); | GELOGW("Fail to load the json file: %s.", path.c_str()); | ||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| @@ -41,7 +41,22 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil | |||||
| const uint8_t *data, size_t size) { | const uint8_t *data, size_t size) { | ||||
| if (size < 1 || size > UINT32_MAX) { | if (size < 1 || size > UINT32_MAX) { | ||||
| GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size); | GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size); | ||||
| ErrorManager::GetInstance().ATCReportErrMessage("E19022"); | |||||
| if (size > UINT32_MAX) { | |||||
| string item = "item"; | |||||
| if (type == MODEL_DEF) { | |||||
| item = "model info"; | |||||
| } else if (type == WEIGHTS_DATA) { | |||||
| item = "weight data"; | |||||
| } else if (type == TASK_INFO) { | |||||
| item = "task info"; | |||||
| } else if (type == TBE_KERNELS) { | |||||
| item = "tbe kernels"; | |||||
| } else if (type == CUST_AICPU_KERNELS) { | |||||
| item = "aicpu kernels"; | |||||
| } | |||||
| ErrorManager::GetInstance().ATCReportErrMessage("E19023", {"size", "item", "maxsize"}, | |||||
| {std::to_string(size), item, std::to_string(UINT32_MAX)}); | |||||
| } | |||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| } | } | ||||
| if (data == nullptr) { | if (data == nullptr) { | ||||
| @@ -263,7 +278,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c | |||||
| } | } | ||||
| Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_); | Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_); | ||||
| if (ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_) != SUCCESS) { | |||||
| if (status != SUCCESS) { | |||||
| GELOGE(status, "Parse model content failed!"); | GELOGE(status, "Parse model content failed!"); | ||||
| return status; | return status; | ||||
| } | } | ||||
| @@ -51,10 +51,23 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager &ProfilingMana | |||||
| return profiling_manager; | return profiling_manager; | ||||
| } | } | ||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options) { | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options, | |||||
| bool convert_2_phy_device_id) { | |||||
| #ifdef DAVINCI_SUPPORT_PROFILING | #ifdef DAVINCI_SUPPORT_PROFILING | ||||
| vector<int32_t>().swap(device_id_); | vector<int32_t>().swap(device_id_); | ||||
| device_id_.push_back(options.device_id); | |||||
| // profiling need phy device id | |||||
| if (!convert_2_phy_device_id) { | |||||
| device_id_.push_back(options.device_id); | |||||
| } else { | |||||
| uint32_t phy_device_id = 0; | |||||
| rtError_t rt_ret = rtGetDevicePhyIdByIndex(static_cast<uint32_t>(options.device_id), &phy_device_id); | |||||
| if (rt_ret != RT_ERROR_NONE) { | |||||
| GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id); | |||||
| return FAILED; | |||||
| } | |||||
| device_id_.push_back(phy_device_id); | |||||
| } | |||||
| job_id_ = options.job_id; | job_id_ = options.job_id; | ||||
| Status ret; | Status ret; | ||||
| @@ -69,7 +69,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { | |||||
| ProfilingManager(); | ProfilingManager(); | ||||
| virtual ~ProfilingManager(); | virtual ~ProfilingManager(); | ||||
| static ProfilingManager &Instance(); | static ProfilingManager &Instance(); | ||||
| ge::Status Init(const Options &options); | |||||
| ge::Status Init(const Options &options, bool convert_2_phy_device_id = false); | |||||
| ge::Status InitFromOptions(const Options &options); | ge::Status InitFromOptions(const Options &options); | ||||
| ge::Status InitFromAclCfg(const std::string &config); | ge::Status InitFromAclCfg(const std::string &config); | ||||
| ge::Status StartProfiling(int32_t iter, int32_t device_id); | ge::Status StartProfiling(int32_t iter, int32_t device_id); | ||||
| @@ -172,6 +172,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpProperties &PropertiesManag | |||||
| return dump_properties_map_[session_id]; | return dump_properties_map_[session_id]; | ||||
| } | } | ||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::AddDumpProperties( | |||||
| uint64_t session_id, const DumpProperties &dump_properties) { | |||||
| std::lock_guard<std::mutex> lock(mutex_); | |||||
| dump_properties_map_.emplace(session_id, dump_properties); | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::RemoveDumpProperties(uint64_t session_id) { | FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::RemoveDumpProperties(uint64_t session_id) { | ||||
| std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
| auto iter = dump_properties_map_.find(session_id); | auto iter = dump_properties_map_.find(session_id); | ||||
| @@ -23,8 +23,8 @@ | |||||
| #include <string> | #include <string> | ||||
| #include <vector> | #include <vector> | ||||
| #include "graph/op_desc.h" | |||||
| #include "common/dump/dump_properties.h" | #include "common/dump/dump_properties.h" | ||||
| #include "graph/op_desc.h" | |||||
| namespace ge { | namespace ge { | ||||
| // Configuration property management | // Configuration property management | ||||
| @@ -83,6 +83,10 @@ class PropertiesManager { | |||||
| void SetPropertyDelimiter(const std::string &de); | void SetPropertyDelimiter(const std::string &de); | ||||
| DumpProperties &GetDumpProperties(uint64_t session_id); | DumpProperties &GetDumpProperties(uint64_t session_id); | ||||
| const map<uint64_t, DumpProperties> &GetDumpPropertiesMap() { return dump_properties_map_; } | |||||
| void AddDumpProperties(uint64_t session_id, const DumpProperties &dump_properties); | |||||
| void RemoveDumpProperties(uint64_t session_id); | void RemoveDumpProperties(uint64_t session_id); | ||||
| private: | private: | ||||
| @@ -19,16 +19,16 @@ | |||||
| #include <fcntl.h> | #include <fcntl.h> | ||||
| #include <sys/stat.h> | #include <sys/stat.h> | ||||
| #include <unistd.h> | |||||
| #include <regex.h> | #include <regex.h> | ||||
| #include <unistd.h> | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <climits> | #include <climits> | ||||
| #include <cstdlib> | #include <cstdlib> | ||||
| #include <ctime> | #include <ctime> | ||||
| #include <fstream> | #include <fstream> | ||||
| #include "external/ge/ge_api_error_codes.h" | |||||
| #include "common/util/error_manager/error_manager.h" | #include "common/util/error_manager/error_manager.h" | ||||
| #include "external/ge/ge_api_error_codes.h" | |||||
| #include "framework/common/debug/ge_log.h" | #include "framework/common/debug/ge_log.h" | ||||
| #include "framework/common/fmk_types.h" | #include "framework/common/fmk_types.h" | ||||
| #include "framework/common/ge_inner_error_codes.h" | #include "framework/common/ge_inner_error_codes.h" | ||||
| @@ -58,6 +58,7 @@ const int kWarningThreshold = 536870912 * 2; // 536870912 represent 512M | |||||
| const int kMaxFileSizeLimit = INT_MAX; | const int kMaxFileSizeLimit = INT_MAX; | ||||
| const int kMaxBuffSize = 256; | const int kMaxBuffSize = 256; | ||||
| const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character"; | const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character"; | ||||
| constexpr uint32_t MAX_CONFIG_FILE_BYTE = 10 * 1024 * 1024; | |||||
| } // namespace | } // namespace | ||||
| namespace ge { | namespace ge { | ||||
| @@ -482,4 +483,69 @@ FMK_FUNC_HOST_VISIBILITY bool ValidateStr(const std::string &str, const std::str | |||||
| regfree(®); | regfree(®); | ||||
| return true; | return true; | ||||
| } | } | ||||
| FMK_FUNC_HOST_VISIBILITY bool IsValidFile(const char *file_path) { | |||||
| if (file_path == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "Config path is null."); | |||||
| return false; | |||||
| } | |||||
| if (!CheckInputPathValid(file_path)) { | |||||
| GELOGE(PARAM_INVALID, "Config path is invalid: %s", file_path); | |||||
| return false; | |||||
| } | |||||
| // Normalize the path | |||||
| std::string resolved_file_path = RealPath(file_path); | |||||
| if (resolved_file_path.empty()) { | |||||
| GELOGE(PARAM_INVALID, "Invalid input file path [%s], make sure that the file path is correct.", file_path); | |||||
| return false; | |||||
| } | |||||
| mmStat_t stat = {0}; | |||||
| int32_t ret = mmStatGet(resolved_file_path.c_str(), &stat); | |||||
| if (ret != EN_OK) { | |||||
| GELOGE(PARAM_INVALID, "cannot get config file status, which path is %s, maybe not exist, return %d, errcode %d", | |||||
| resolved_file_path.c_str(), ret, mmGetErrorCode()); | |||||
| return false; | |||||
| } | |||||
| if ((stat.st_mode & S_IFMT) != S_IFREG) { | |||||
| GELOGE(PARAM_INVALID, "config file is not a common file, which path is %s, mode is %u", resolved_file_path.c_str(), | |||||
| stat.st_mode); | |||||
| return false; | |||||
| } | |||||
| if (stat.st_size > MAX_CONFIG_FILE_BYTE) { | |||||
| GELOGE(PARAM_INVALID, "config file %s size[%ld] is larger than max config file Bytes[%u]", | |||||
| resolved_file_path.c_str(), stat.st_size, MAX_CONFIG_FILE_BYTE); | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
| FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status CheckPath(const char *path, size_t length) { | |||||
| if (path == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "Config path is invalid."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| if (strlen(path) != length) { | |||||
| GELOGE(PARAM_INVALID, "Path is invalid or length of config path is not equal to given length."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| if (length == 0 || length > MMPA_MAX_PATH) { | |||||
| GELOGE(PARAM_INVALID, "Length of config path is invalid."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| INT32 is_dir = mmIsDir(path); | |||||
| if (is_dir != EN_OK) { | |||||
| GELOGE(PATH_INVALID, "Open directory %s failed, maybe it is not exit or not a dir", path); | |||||
| return PATH_INVALID; | |||||
| } | |||||
| if (mmAccess2(path, M_R_OK) != EN_OK) { | |||||
| GELOGE(PATH_INVALID, "Read path[%s] failed, errmsg[%s]", path, strerror(errno)); | |||||
| return PATH_INVALID; | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| } // namespace ge | } // namespace ge | ||||
| @@ -73,6 +73,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} | |||||
| "../graph/manager/trans_var_data_utils.cc" | "../graph/manager/trans_var_data_utils.cc" | ||||
| "../graph/manager/util/debug.cc" | "../graph/manager/util/debug.cc" | ||||
| "../hybrid/hybrid_davinci_model_stub.cc" | "../hybrid/hybrid_davinci_model_stub.cc" | ||||
| "../hybrid/node_executor/aicpu/aicpu_ext_info.cc" | |||||
| "../model/ge_model.cc" | "../model/ge_model.cc" | ||||
| "../model/ge_root_model.cc" | "../model/ge_root_model.cc" | ||||
| "../omm/csa_interact.cc" | "../omm/csa_interact.cc" | ||||
| @@ -118,6 +119,7 @@ target_link_libraries(ge_executor | |||||
| ${slog} | ${slog} | ||||
| ${mmpa} | ${mmpa} | ||||
| ${msprof} | ${msprof} | ||||
| ${error_manager} | |||||
| rt | rt | ||||
| dl) | dl) | ||||
| @@ -182,6 +182,37 @@ bool IsDynamicImageSizeMatchModel(uint64_t image_height, uint64_t image_width, | |||||
| GELOGE(ge::FAILED, "Dynamic resolution (%lu,%lu) can not match the gear of model.", image_height, image_width); | GELOGE(ge::FAILED, "Dynamic resolution (%lu,%lu) can not match the gear of model.", image_height, image_width); | ||||
| return false; | return false; | ||||
| } | } | ||||
| bool IsDynmaicDimsSizeMatchModel(const vector<uint64_t> cur_dynamic_dims, const vector<vector<int64_t>> &batch_info) { | |||||
| if (batch_info.empty()) { | |||||
| GELOGE(ge::FAILED, "Dynamic batch info is empty."); | |||||
| return false; | |||||
| } | |||||
| bool find_match = false; | |||||
| for (auto resolution : batch_info) { | |||||
| if (cur_dynamic_dims.size() != resolution.size()) { | |||||
| GELOGE(ge::FAILED, "Cur dynamic dims param num is %zu, current resolution size is %zu.", cur_dynamic_dims.size(), | |||||
| resolution.size()); | |||||
| return false; | |||||
| } | |||||
| bool flag = true; | |||||
| for (std::size_t i = 0; i < resolution.size(); ++i) { | |||||
| if (cur_dynamic_dims[i] != static_cast<uint64_t>(resolution[i])) { | |||||
| flag = false; | |||||
| break; | |||||
| } | |||||
| } | |||||
| if (flag) { | |||||
| find_match = true; | |||||
| break; | |||||
| } | |||||
| } | |||||
| if (!find_match) { | |||||
| GELOGE(ge::FAILED, "choose dynamic dims can not match the gear of model."); | |||||
| } | |||||
| return find_match; | |||||
| } | |||||
| } // namespace | } // namespace | ||||
| namespace ge { | namespace ge { | ||||
| @@ -347,9 +378,21 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u | |||||
| vector<uint64_t> cur_dynamic_dims; | vector<uint64_t> cur_dynamic_dims; | ||||
| Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims); | Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(FAILED, "Set cur gear dynmaic dims failed"); | |||||
| GELOGE(FAILED, "Set cur gear dynamic dims failed"); | |||||
| return FAILED; | return FAILED; | ||||
| } | } | ||||
| std::vector<std::vector<int64_t>> batch_info; | |||||
| int32_t dynamic_type = static_cast<int32_t>(FIXED); | |||||
| ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info, dynamic_type); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Get dynamic input info failed."); | |||||
| return ret; | |||||
| } | |||||
| if (!IsDynmaicDimsSizeMatchModel(cur_dynamic_dims, batch_info)) { | |||||
| GELOGE(PARAM_INVALID, "The current dynamic input does not match the gear of the model."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast<int32_t>(DYNAMIC_DIMS)); | ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast<int32_t>(DYNAMIC_DIMS)); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| @@ -410,6 +453,10 @@ Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector<uint64_t> & | |||||
| for (std::size_t i = 0; i < all_data_dims.size(); ++i) { | for (std::size_t i = 0; i < all_data_dims.size(); ++i) { | ||||
| if (all_data_dims[i] < 0) { | if (all_data_dims[i] < 0) { | ||||
| cur_dynamic_dims.push_back(dynamic_dims[i]); | cur_dynamic_dims.push_back(dynamic_dims[i]); | ||||
| } else if (static_cast<uint64_t>(all_data_dims[i]) != dynamic_dims[i]) { | |||||
| GELOGE(PARAM_INVALID, "Static dims should be same, index: %zu value: %d should be %d", i, dynamic_dims[i], | |||||
| all_data_dims[i]); | |||||
| return PARAM_INVALID; | |||||
| } | } | ||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -60,6 +60,7 @@ local_ge_executor_src_files := \ | |||||
| ../single_op/task/aicpu_task_builder.cc \ | ../single_op/task/aicpu_task_builder.cc \ | ||||
| ../single_op/task/aicpu_kernel_task_builder.cc \ | ../single_op/task/aicpu_kernel_task_builder.cc \ | ||||
| ../hybrid/hybrid_davinci_model_stub.cc\ | ../hybrid/hybrid_davinci_model_stub.cc\ | ||||
| ../hybrid/node_executor/aicpu/aicpu_ext_info.cc \ | |||||
| local_ge_executor_c_include := \ | local_ge_executor_c_include := \ | ||||
| proto/insert_op.proto \ | proto/insert_op.proto \ | ||||
| @@ -87,6 +88,7 @@ local_ge_executor_shared_library := \ | |||||
| libgraph \ | libgraph \ | ||||
| libregister \ | libregister \ | ||||
| libmsprof \ | libmsprof \ | ||||
| liberror_manager \ | |||||
| local_ge_executor_ldflags := -lrt -ldl \ | local_ge_executor_ldflags := -lrt -ldl \ | ||||
| @@ -137,6 +139,7 @@ LOCAL_SHARED_LIBRARIES := \ | |||||
| libgraph \ | libgraph \ | ||||
| libregister \ | libregister \ | ||||
| libmsprof \ | libmsprof \ | ||||
| liberror_manager \ | |||||
| LOCAL_LDFLAGS += $(local_ge_executor_ldflags) | LOCAL_LDFLAGS += $(local_ge_executor_ldflags) | ||||
| @@ -254,6 +254,7 @@ OME_HOST_SRC_FILES := \ | |||||
| single_op/stream_resource.cc \ | single_op/stream_resource.cc \ | ||||
| single_op/single_op_manager.cc \ | single_op/single_op_manager.cc \ | ||||
| hybrid/hybrid_davinci_model_stub.cc \ | hybrid/hybrid_davinci_model_stub.cc \ | ||||
| hybrid/node_executor/aicpu/aicpu_ext_info.cc \ | |||||
| # graph/load/new_model_manager/task_info/hccl_task_info.cc | # graph/load/new_model_manager/task_info/hccl_task_info.cc | ||||
| OME_DEVICE_SRC_FILES := $(OME_HOST_SRC_FILES) | OME_DEVICE_SRC_FILES := $(OME_HOST_SRC_FILES) | ||||
| @@ -286,6 +287,7 @@ COMMON_LOCAL_C_INCLUDES := \ | |||||
| $(TOPDIR)inc/runtime \ | $(TOPDIR)inc/runtime \ | ||||
| $(TOPDIR)libc_sec/include \ | $(TOPDIR)libc_sec/include \ | ||||
| $(TOPDIR)ops/built-in/op_proto/inc \ | $(TOPDIR)ops/built-in/op_proto/inc \ | ||||
| $(TOPDIR)toolchain/ide/ide-daemon/external \ | |||||
| third_party/json/include \ | third_party/json/include \ | ||||
| third_party/protobuf/include \ | third_party/protobuf/include \ | ||||
| third_party/opencv/include \ | third_party/opencv/include \ | ||||
| @@ -340,6 +342,7 @@ DEVICE_LOCAL_C_INCLUDES := \ | |||||
| $(TOPDIR)inc/runtime \ | $(TOPDIR)inc/runtime \ | ||||
| $(TOPDIR)ops/built-in/op_proto/inc \ | $(TOPDIR)ops/built-in/op_proto/inc \ | ||||
| $(TOPDIR)framework/domi \ | $(TOPDIR)framework/domi \ | ||||
| $(TOPDIR)toolchain/ide/ide-daemon/external \ | |||||
| third_party/json/include \ | third_party/json/include \ | ||||
| third_party/protobuf/include \ | third_party/protobuf/include \ | ||||
| third_party/opencv/include \ | third_party/opencv/include \ | ||||
| @@ -368,6 +371,7 @@ LOCAL_SRC_FILES += $(BUILER_SRC_FILES) | |||||
| LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) | LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES) | ||||
| LOCAL_STATIC_LIBRARIES := libge_memory \ | LOCAL_STATIC_LIBRARIES := libge_memory \ | ||||
| libadump_server_stub \ | |||||
| LOCAL_SHARED_LIBRARIES := \ | LOCAL_SHARED_LIBRARIES := \ | ||||
| libc_sec \ | libc_sec \ | ||||
| @@ -432,6 +436,7 @@ LOCAL_C_INCLUDES := $(DEVICE_LOCAL_C_INCLUDES) | |||||
| LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) | LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES) | ||||
| LOCAL_STATIC_LIBRARIES := libge_memory \ | LOCAL_STATIC_LIBRARIES := libge_memory \ | ||||
| libadump_server_stub \ | |||||
| LOCAL_SHARED_LIBRARIES := \ | LOCAL_SHARED_LIBRARIES := \ | ||||
| libc_sec \ | libc_sec \ | ||||
| @@ -25,40 +25,65 @@ | |||||
| #include "common/ge/plugin_manager.h" | #include "common/ge/plugin_manager.h" | ||||
| #include "graph/utils/type_utils.h" | #include "graph/utils/type_utils.h" | ||||
| #include "common/fp16_t.h" | #include "common/fp16_t.h" | ||||
| #include "common/math/math_util.h" | |||||
| namespace { | namespace { | ||||
| #define CREATE_OUTPUT_CASE(DTYPE, TYPE) \ | |||||
| case (DTYPE): { \ | |||||
| GeTensorPtr ge_tensor = nullptr; \ | |||||
| if (need_create_flag) { \ | |||||
| int64_t data_num = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \ | |||||
| std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \ | |||||
| if (buf == nullptr) { \ | |||||
| GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \ | |||||
| static_cast<size_t>(sizeof(TYPE) * data_num)); \ | |||||
| return MEMALLOC_FAILED; \ | |||||
| } \ | |||||
| ge_tensor = MakeShared<GeTensor>(out_desc); \ | |||||
| GE_CHECK_NOTNULL(ge_tensor); \ | |||||
| GELOGI("node:%s allocate output %zu, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \ | |||||
| ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)); \ | |||||
| ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \ | |||||
| ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \ | |||||
| outputs.emplace_back(ge_tensor); \ | |||||
| } else { \ | |||||
| ge_tensor = outputs[i]; \ | |||||
| GE_CHECK_NOTNULL(ge_tensor); \ | |||||
| GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \ | |||||
| reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \ | |||||
| } \ | |||||
| auto tensor = TensorAdapter::AsTensor(*ge_tensor); \ | |||||
| auto tensor_name = op_desc->GetOutputNameByIndex(i); \ | |||||
| GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \ | |||||
| op_desc->GetName().c_str(), i); \ | |||||
| GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \ | |||||
| op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \ | |||||
| named_outputs.emplace(tensor_name, tensor); \ | |||||
| break; \ | |||||
| #define CREATE_OUTPUT_CASE(DTYPE, TYPE) \ | |||||
| case (DTYPE): { \ | |||||
| GeTensorPtr ge_tensor = nullptr; \ | |||||
| if (need_create_flag) { \ | |||||
| int64_t num_size = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \ | |||||
| if (out_desc.GetShape().IsUnknownShape()) { \ | |||||
| std::vector<std::pair<int64_t, int64_t>> range; \ | |||||
| if (out_desc.GetShapeRange(range) != GRAPH_SUCCESS) { \ | |||||
| GELOGE(INTERNAL_ERROR, "Get shape range failed, node:%s", op_desc->GetName().c_str()); \ | |||||
| return INTERNAL_ERROR; \ | |||||
| } \ | |||||
| int64_t max_range_size = 1; \ | |||||
| for (const auto &item : range) { \ | |||||
| FMK_INT64_MULCHECK(max_range_size, item.second); \ | |||||
| max_range_size *= item.second; \ | |||||
| } \ | |||||
| num_size = max_range_size; \ | |||||
| } \ | |||||
| if (num_size < 0) { \ | |||||
| GELOGE(INTERNAL_ERROR, "node:%s, get size for output %zu failed, num=%lld", op_desc->GetName().c_str(), i, \ | |||||
| num_size); \ | |||||
| return INTERNAL_ERROR; \ | |||||
| } \ | |||||
| auto data_num = static_cast<uint64_t>(num_size); \ | |||||
| GELOGI("node:%s allocate output %zu start, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \ | |||||
| std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \ | |||||
| if (buf == nullptr) { \ | |||||
| GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \ | |||||
| static_cast<size_t>(sizeof(TYPE) * data_num)); \ | |||||
| return MEMALLOC_FAILED; \ | |||||
| } \ | |||||
| ge_tensor = MakeShared<GeTensor>(out_desc); \ | |||||
| GE_CHECK_NOTNULL(ge_tensor); \ | |||||
| GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, \ | |||||
| data_num * sizeof(TYPE)); \ | |||||
| if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) { \ | |||||
| GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str()); \ | |||||
| return MEMALLOC_FAILED; \ | |||||
| } \ | |||||
| ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \ | |||||
| ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \ | |||||
| outputs.emplace_back(ge_tensor); \ | |||||
| } else { \ | |||||
| ge_tensor = outputs[i]; \ | |||||
| GE_CHECK_NOTNULL(ge_tensor); \ | |||||
| GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \ | |||||
| reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \ | |||||
| } \ | |||||
| auto tensor = TensorAdapter::AsTensor(*ge_tensor); \ | |||||
| auto tensor_name = op_desc->GetOutputNameByIndex(i); \ | |||||
| GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \ | |||||
| op_desc->GetName().c_str(), i); \ | |||||
| GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \ | |||||
| op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \ | |||||
| named_outputs.emplace(tensor_name, tensor); \ | |||||
| break; \ | |||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| @@ -296,6 +296,7 @@ LIBGE_LOCAL_SRC_FILES := \ | |||||
| LIBCLIENT_LOCAL_SRC_FILES := \ | LIBCLIENT_LOCAL_SRC_FILES := \ | ||||
| proto/ge_api.proto \ | proto/ge_api.proto \ | ||||
| client/ge_api.cc \ | client/ge_api.cc \ | ||||
| client/ge_prof.cc \ | |||||
| RUNNER_LOCAL_C_INCLUDES := \ | RUNNER_LOCAL_C_INCLUDES := \ | ||||
| $(LOCAL_PATH) ./ \ | $(LOCAL_PATH) ./ \ | ||||
| @@ -312,6 +313,7 @@ RUNNER_LOCAL_C_INCLUDES := \ | |||||
| $(TOPDIR)libc_sec/include \ | $(TOPDIR)libc_sec/include \ | ||||
| $(TOPDIR)ops/built-in/op_proto/inc \ | $(TOPDIR)ops/built-in/op_proto/inc \ | ||||
| $(TOPDIR)framework/domi/analyzer \ | $(TOPDIR)framework/domi/analyzer \ | ||||
| $(TOPDIR)toolchain/ide/ide-daemon/external \ | |||||
| proto/fwk_adapter.proto \ | proto/fwk_adapter.proto \ | ||||
| proto/ge_ir.proto \ | proto/ge_ir.proto \ | ||||
| proto/insert_op.proto \ | proto/insert_op.proto \ | ||||
| @@ -353,6 +355,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES) | |||||
| LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | ||||
| LOCAL_STATIC_LIBRARIES := libge_memory \ | LOCAL_STATIC_LIBRARIES := libge_memory \ | ||||
| libadump_server \ | |||||
| LOCAL_SHARED_LIBRARIES := \ | LOCAL_SHARED_LIBRARIES := \ | ||||
| libc_sec \ | libc_sec \ | ||||
| @@ -371,6 +374,7 @@ LOCAL_LDFLAGS := -lrt -ldl | |||||
| LOCAL_SHARED_LIBRARIES += \ | LOCAL_SHARED_LIBRARIES += \ | ||||
| libruntime \ | libruntime \ | ||||
| libresource \ | libresource \ | ||||
| stub/libascend_hal \ | |||||
| include $(BUILD_HOST_SHARED_LIBRARY) | include $(BUILD_HOST_SHARED_LIBRARY) | ||||
| @@ -389,6 +393,7 @@ endif | |||||
| LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) | LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) | ||||
| LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc | LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc | ||||
| LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_prof.cc | |||||
| LOCAL_SHARED_LIBRARIES := | LOCAL_SHARED_LIBRARIES := | ||||
| @@ -438,6 +443,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES) | |||||
| LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | ||||
| LOCAL_STATIC_LIBRARIES := libge_memory \ | LOCAL_STATIC_LIBRARIES := libge_memory \ | ||||
| libadump_server \ | |||||
| LOCAL_SHARED_LIBRARIES := \ | LOCAL_SHARED_LIBRARIES := \ | ||||
| libc_sec \ | libc_sec \ | ||||
| @@ -450,6 +456,7 @@ LOCAL_LDFLAGS := -lrt -ldl | |||||
| LOCAL_SHARED_LIBRARIES += \ | LOCAL_SHARED_LIBRARIES += \ | ||||
| libruntime \ | libruntime \ | ||||
| libresource \ | libresource \ | ||||
| stub/libascend_hal \ | |||||
| include $(BUILD_HOST_STATIC_LIBRARY) | include $(BUILD_HOST_STATIC_LIBRARY) | ||||
| @@ -469,6 +476,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES) | |||||
| LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) | ||||
| LOCAL_STATIC_LIBRARIES := libge_memory \ | LOCAL_STATIC_LIBRARIES := libge_memory \ | ||||
| libadump_server \ | |||||
| LOCAL_SHARED_LIBRARIES := \ | LOCAL_SHARED_LIBRARIES := \ | ||||
| libc_sec \ | libc_sec \ | ||||
| @@ -481,5 +489,6 @@ LOCAL_LDFLAGS := -lrt -ldl | |||||
| LOCAL_SHARED_LIBRARIES += \ | LOCAL_SHARED_LIBRARIES += \ | ||||
| libruntime \ | libruntime \ | ||||
| libresource \ | libresource \ | ||||
| libascend_hal \ | |||||
| include $(BUILD_STATIC_LIBRARY) | include $(BUILD_STATIC_LIBRARY) | ||||
| @@ -1296,6 +1296,11 @@ void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &s | |||||
| return; | return; | ||||
| } | } | ||||
| if (dest[i] != nullptr && src[i] != nullptr) { | if (dest[i] != nullptr && src[i] != nullptr) { | ||||
| if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) { | |||||
| GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.", i, | |||||
| dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_); | |||||
| continue; | |||||
| } | |||||
| for (auto &symbol : src[i]->SymbolList()) { | for (auto &symbol : src[i]->SymbolList()) { | ||||
| dest[i]->AddSymbol(symbol); | dest[i]->AddSymbol(symbol); | ||||
| } | } | ||||
| @@ -227,7 +227,10 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse | |||||
| if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { | if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { | ||||
| GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset, | GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset, | ||||
| VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); | VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); | ||||
| ErrorManager::GetInstance().ATCReportErrMessage("E19022"); | |||||
| ErrorManager::GetInstance().ATCReportErrMessage( | |||||
| "E19022", {"size", "item", "maxsize"}, | |||||
| {std::to_string(mem_offset), "featuremap", | |||||
| std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())}); | |||||
| return ge::FAILED; | return ge::FAILED; | ||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -908,6 +911,8 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP | |||||
| GELOGE(ret, "Assign atomic workspace memory failed, node is %s.", node_op_desc->GetName().c_str()); | GELOGE(ret, "Assign atomic workspace memory failed, node is %s.", node_op_desc->GetName().c_str()); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| } else { | |||||
| GELOGW("Current atomic node %s does not have attr ATOMIC_WORKSPACE_INFO.", node->GetName().c_str()); | |||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -1452,14 +1457,56 @@ Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int6 | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| ge::Status GraphMemoryAssigner::IsIndependentAtomicClean(const ge::NodePtr &node, | |||||
| bool &is_independent_atomic_clean_node) { | |||||
| GE_CHECK_NOTNULL(node); | |||||
| const auto &out_control_anchor = node->GetOutControlAnchor(); | |||||
| GE_CHECK_NOTNULL(out_control_anchor); | |||||
| for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) { | |||||
| if (peer_in_control_anchor != nullptr) { | |||||
| auto peer_in_node = peer_in_control_anchor->GetOwnerNode(); | |||||
| auto peer_in_node_desc = peer_in_node->GetOpDesc(); | |||||
| if (peer_in_node_desc != nullptr) { | |||||
| bool is_atomic_node = false; | |||||
| // If GetBool fail, is_atomic_node is false. | |||||
| (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic_node); | |||||
| if (is_atomic_node) { | |||||
| vector<int> is_connect_netoutput; | |||||
| // If GetBool fail, attr is_connect_netoutput is an empty vector. | |||||
| (void)ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connect_netoutput); | |||||
| if (!is_connect_netoutput.empty()) { | |||||
| GELOGD("Peer in node %s is independent atomic clean node", peer_in_node->GetName().c_str()); | |||||
| is_independent_atomic_clean_node = true; | |||||
| break; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, const vector<int64_t> &atomic_mem_start, | ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, const vector<int64_t> &atomic_mem_start, | ||||
| const vector<int64_t> &atomic_mem_size) { | const vector<int64_t> &atomic_mem_size) { | ||||
| for (ge::NodePtr &node : compute_graph_->GetAllNodes()) { | for (ge::NodePtr &node : compute_graph_->GetAllNodes()) { | ||||
| auto node_op_desc = node->GetOpDesc(); | auto node_op_desc = node->GetOpDesc(); | ||||
| GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); | GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); | ||||
| if (((n != nullptr) && (node->GetName() == n->GetName())) || | |||||
| ((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) { | |||||
| bool is_valid_atomic_clean_node = (n != nullptr) && (node->GetName() == n->GetName()); | |||||
| if (((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) { | |||||
| bool is_independent_atomic_clean = false; | |||||
| if (IsIndependentAtomicClean(node, is_independent_atomic_clean) != SUCCESS) { | |||||
| GELOGE(FAILED, "Failed to determine the connection relationship of atomic addr clean node."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| is_valid_atomic_clean_node = is_valid_atomic_clean_node || (!is_independent_atomic_clean); | |||||
| } | |||||
| if (is_valid_atomic_clean_node) { | |||||
| GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str()); | |||||
| vector<int64_t> workspace_vector = node_op_desc->GetWorkspace(); | vector<int64_t> workspace_vector = node_op_desc->GetWorkspace(); | ||||
| vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes(); | vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes(); | ||||
| workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end()); | workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end()); | ||||
| @@ -175,6 +175,8 @@ class GraphMemoryAssigner { | |||||
| ge::Status SetAtomicCleanAttr(const ge::NodePtr &n, const std::vector<int64_t> &atomic_mem_start, | ge::Status SetAtomicCleanAttr(const ge::NodePtr &n, const std::vector<int64_t> &atomic_mem_start, | ||||
| const std::vector<int64_t> &atomic_mem_size); | const std::vector<int64_t> &atomic_mem_size); | ||||
| ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node); | |||||
| void AlignMemOffset(const int64_t &mem_align_size); | void AlignMemOffset(const int64_t &mem_align_size); | ||||
| ge::Status UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const; | ge::Status UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const; | ||||
| @@ -266,6 +266,14 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| if (is_unknown_shape) { | if (is_unknown_shape) { | ||||
| GE_CHK_STATUS_RET(SetUnknownShapeStream(run_context, stream), "Set unknown shape stream failed."); | GE_CHK_STATUS_RET(SetUnknownShapeStream(run_context, stream), "Set unknown shape stream failed."); | ||||
| } | } | ||||
| std::function<void()> callback = [&]() { | |||||
| if (is_unknown_shape) { | |||||
| if (DestroyUnknownShapeStream(run_context, stream) != SUCCESS) { | |||||
| GELOGE(FAILED, "Destory unknown shape stream failed."); | |||||
| } | |||||
| } | |||||
| }; | |||||
| GE_MAKE_GUARD(release, callback); | |||||
| for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
| OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
| @@ -352,9 +360,6 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, | op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, | ||||
| task_list_size_after - task_list_size_before); | task_list_size_after - task_list_size_before); | ||||
| } | } | ||||
| if (is_unknown_shape) { | |||||
| GE_CHK_STATUS_RET(DestroyUnknownShapeStream(run_context, stream), "Destory unknown shape stream failed."); | |||||
| } | |||||
| GE_TIMESTAMP_CALLNUM_EVENT_END(GenerateTask, "GraphBuild::GenerateTask"); | GE_TIMESTAMP_CALLNUM_EVENT_END(GenerateTask, "GraphBuild::GenerateTask"); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -532,6 +537,9 @@ Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) { | |||||
| (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node); | (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node); | ||||
| } | } | ||||
| (void)op_desc->DelAttr(kIsFirstNode); | |||||
| (void)op_desc->DelAttr(kIsLastNode); | |||||
| all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc); | all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc); | ||||
| } | } | ||||
| @@ -645,8 +653,6 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP | |||||
| vector<uint32_t> &all_reduce_nodes) const { | vector<uint32_t> &all_reduce_nodes) const { | ||||
| GELOGI("Start AutoFindBpOpIndex"); | GELOGI("Start AutoFindBpOpIndex"); | ||||
| NodePtr bp_node = nullptr; | NodePtr bp_node = nullptr; | ||||
| uint32_t last_bp = 0; | |||||
| uint32_t iter_end = 0; | |||||
| uint32_t current_idx = 0; | uint32_t current_idx = 0; | ||||
| for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
| OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
| @@ -662,20 +668,40 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP | |||||
| all_reduce_nodes.emplace_back(current_idx); | all_reduce_nodes.emplace_back(current_idx); | ||||
| GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx); | GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx); | ||||
| } | } | ||||
| if (op_desc->GetType() == NETOUTPUT) { | |||||
| if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) { | |||||
| if (bp_node == nullptr) { | if (bp_node == nullptr) { | ||||
| bp_node = node; | bp_node = node; | ||||
| } | } | ||||
| iter_end = current_idx; | |||||
| GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end); | |||||
| } | |||||
| if (graph->GetNeedIteration()) { | |||||
| if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive", | |||||
| op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| } else { | |||||
| if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| profiling_point.end_index = iter_end; | |||||
| if (bp_node == nullptr) { | if (bp_node == nullptr) { | ||||
| GELOGW("not find bp_node."); | GELOGW("not find bp_node."); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node); | |||||
| return SUCCESS; | |||||
| } | |||||
| uint32_t TaskGenerator::FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const { | |||||
| uint32_t last_bp = 0; | |||||
| OpDescPtr bp_op_desc = nullptr; | OpDescPtr bp_op_desc = nullptr; | ||||
| for (auto &in_anchor : bp_node->GetAllInDataAnchors()) { | for (auto &in_anchor : bp_node->GetAllInDataAnchors()) { | ||||
| auto out_anchor = in_anchor->GetPeerOutAnchor(); | auto out_anchor = in_anchor->GetPeerOutAnchor(); | ||||
| @@ -691,7 +717,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP | |||||
| } | } | ||||
| GE_CHECK_NOTNULL(bp_op_desc); | GE_CHECK_NOTNULL(bp_op_desc); | ||||
| current_idx = 0; | |||||
| uint32_t current_idx = 0; | |||||
| for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
| OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
| GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
| @@ -702,8 +728,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP | |||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| profiling_point.bp_index = last_bp; | |||||
| return SUCCESS; | |||||
| return last_bp; | |||||
| } | } | ||||
| Status TaskGenerator::FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str, | Status TaskGenerator::FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str, | ||||
| @@ -734,7 +759,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin | |||||
| ProfilingPoint &profiling_point, vector<uint32_t> &all_reduce_nodes) const { | ProfilingPoint &profiling_point, vector<uint32_t> &all_reduce_nodes) const { | ||||
| GELOGI("Start FindBpOfEnv"); | GELOGI("Start FindBpOfEnv"); | ||||
| uint32_t current_idx = 0; | uint32_t current_idx = 0; | ||||
| uint32_t iter_end = 0; | |||||
| uint32_t last_bp = 0; | uint32_t last_bp = 0; | ||||
| for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
| OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
| @@ -745,10 +769,23 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin | |||||
| continue; | continue; | ||||
| } | } | ||||
| if (op_desc->GetType() == NETOUTPUT) { | |||||
| iter_end = current_idx; | |||||
| GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end); | |||||
| if (graph->GetNeedIteration()) { | |||||
| if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive", | |||||
| op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| } else { | |||||
| if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) { | |||||
| profiling_point.end_index.insert(current_idx); | |||||
| GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx); | |||||
| } | |||||
| } | } | ||||
| if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) { | if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) { | ||||
| all_reduce_nodes.emplace_back(current_idx); | all_reduce_nodes.emplace_back(current_idx); | ||||
| GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx); | GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx); | ||||
| @@ -760,7 +797,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin | |||||
| } | } | ||||
| profiling_point.bp_index = last_bp; | profiling_point.bp_index = last_bp; | ||||
| profiling_point.end_index = iter_end; | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -857,7 +893,7 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||||
| bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
| ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ||||
| if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | ||||
| (profiling_point.end_index == 0)) { | |||||
| (profiling_point.end_index.empty())) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (profiling_point.fp_index == node_index) { | if (profiling_point.fp_index == node_index) { | ||||
| @@ -914,7 +950,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||||
| bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
| ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ||||
| if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | ||||
| (profiling_point.end_index == 0)) { | |||||
| (profiling_point.end_index.empty())) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (profiling_point.bp_index == node_index) { | if (profiling_point.bp_index == node_index) { | ||||
| @@ -928,7 +964,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||||
| bp_log_def->set_notify(false); | bp_log_def->set_notify(false); | ||||
| task_def_list.emplace_back(bp_task_def); | task_def_list.emplace_back(bp_task_def); | ||||
| } | } | ||||
| if (profiling_point.end_index == node_index) { | |||||
| if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) { | |||||
| GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
| TaskDef end_task_def; | TaskDef end_task_def; | ||||
| end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ||||
| @@ -36,7 +36,7 @@ class OpsKernelManager; | |||||
| struct ProfilingPoint { | struct ProfilingPoint { | ||||
| uint32_t fp_index = 0; | uint32_t fp_index = 0; | ||||
| uint32_t bp_index = 0; | uint32_t bp_index = 0; | ||||
| uint32_t end_index = 0; | |||||
| std::set<uint32_t> end_index; | |||||
| }; | }; | ||||
| // Describes infos needed by generate task for fusion node | // Describes infos needed by generate task for fusion node | ||||
| struct FusionTaskInfo { | struct FusionTaskInfo { | ||||
| @@ -112,6 +112,7 @@ class TaskGenerator { | |||||
| Status AutoFindFpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point) const; | Status AutoFindFpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point) const; | ||||
| Status AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | Status AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | ||||
| vector<uint32_t> &all_reduce_nodes) const; | vector<uint32_t> &all_reduce_nodes) const; | ||||
| uint32_t FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const; | |||||
| Status FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str, | Status FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str, | ||||
| ProfilingPoint &profiling_point) const; | ProfilingPoint &profiling_point) const; | ||||
| @@ -125,6 +125,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptr<ModelListener | |||||
| rt_model_stream_(nullptr), | rt_model_stream_(nullptr), | ||||
| is_inner_model_stream_(false), | is_inner_model_stream_(false), | ||||
| is_async_mode_(false), | is_async_mode_(false), | ||||
| last_execute_mode_(false), | |||||
| session_id_(0), | session_id_(0), | ||||
| device_id_(0), | device_id_(0), | ||||
| maxDumpOpNum_(0), | maxDumpOpNum_(0), | ||||
| @@ -2879,6 +2880,12 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<v | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId()); | |||||
| if (it == zero_copy_op_id_batch_label_.end()) { | |||||
| zero_copy_task.SetBatchLabel(kDefaultBatchLable); | |||||
| } else { | |||||
| zero_copy_task.SetBatchLabel(it->second); | |||||
| } | |||||
| std::lock_guard<std::mutex> lock(outside_addrs_mutex_); | std::lock_guard<std::mutex> lock(outside_addrs_mutex_); | ||||
| if (zero_copy_task.IsTaskArgsSet()) { | if (zero_copy_task.IsTaskArgsSet()) { | ||||
| @@ -3045,6 +3052,9 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> & | |||||
| data.first, addr, size, buffer_addr); | data.first, addr, size, buffer_addr); | ||||
| // For input data, just copy for rts task. | // For input data, just copy for rts task. | ||||
| for (ZeroCopyTask &task : zero_copy_tasks_) { | for (ZeroCopyTask &task : zero_copy_tasks_) { | ||||
| if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) { | |||||
| continue; | |||||
| } | |||||
| uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr); | uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr); | ||||
| if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) { | if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) { | ||||
| return FAILED; | return FAILED; | ||||
| @@ -3365,6 +3375,7 @@ Status DavinciModel::InitModelStream(rtStream_t stream) { | |||||
| if (is_async_mode_) { | if (is_async_mode_) { | ||||
| rt_model_stream_ = stream; | rt_model_stream_ = stream; | ||||
| is_inner_model_stream_ = false; | is_inner_model_stream_ = false; | ||||
| last_execute_mode_ = true; | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -3376,12 +3387,14 @@ Status DavinciModel::InitModelStream(rtStream_t stream) { | |||||
| rt_model_stream_ = stream; | rt_model_stream_ = stream; | ||||
| is_inner_model_stream_ = false; | is_inner_model_stream_ = false; | ||||
| last_execute_mode_ = false; | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (rt_model_stream_ == nullptr) { | |||||
| if (last_execute_mode_ || (rt_model_stream_ == nullptr)) { | |||||
| GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_FORBIDDEN_DEFAULT)); | GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_FORBIDDEN_DEFAULT)); | ||||
| is_inner_model_stream_ = true; | is_inner_model_stream_ = true; | ||||
| last_execute_mode_ = false; | |||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -3516,7 +3529,7 @@ uint8_t *DavinciModel::MallocWeightsMem(size_t weights_size) { | |||||
| } | } | ||||
| void DavinciModel::FreeFeatureMapMem() { | void DavinciModel::FreeFeatureMapMem() { | ||||
| if (std::getenv(kEnvGeuseStaticMemory) != nullptr) { | |||||
| if (std::getenv(kEnvGeuseStaticMemory) != nullptr && is_inner_mem_base_) { | |||||
| string weight_memory_key = std::to_string(0) + "_f"; | string weight_memory_key = std::to_string(0) + "_f"; | ||||
| if (MemManager::Instance(RT_MEMORY_HBM)->GetMemoryAddr(weight_memory_key) != nullptr) { | if (MemManager::Instance(RT_MEMORY_HBM)->GetMemoryAddr(weight_memory_key) != nullptr) { | ||||
| GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_HBM)->FreeMemory(weight_memory_key, GetDeviceId()), | GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_HBM)->FreeMemory(weight_memory_key, GetDeviceId()), | ||||
| @@ -884,6 +884,7 @@ class DavinciModel { | |||||
| bool is_inner_model_stream_; | bool is_inner_model_stream_; | ||||
| bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_. | bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_. | ||||
| bool last_execute_mode_; | |||||
| bool is_stream_list_bind_{false}; | bool is_stream_list_bind_{false}; | ||||
| bool is_pure_head_stream_{false}; | bool is_pure_head_stream_{false}; | ||||
| @@ -43,6 +43,13 @@ const std::string kCmdTypeProfInit = "prof_init"; | |||||
| const std::string kCmdTypeProfFinalize = "prof_finalize"; | const std::string kCmdTypeProfFinalize = "prof_finalize"; | ||||
| const std::string kCmdTypeProfStart = "prof_start"; | const std::string kCmdTypeProfStart = "prof_start"; | ||||
| const std::string kCmdTypeProfStop = "prof_stop"; | const std::string kCmdTypeProfStop = "prof_stop"; | ||||
// Name of the device-side AICPU kernel that registers a custom-op .so
// shipped in host memory (see LaunchCustAicpuSo, which passes this to
// rtCpuKernelLaunch).
const char *const kLoadOpFromBuf = "loadOpFromBuf";

// Argument block handed to the "loadOpFromBuf" kernel. The layout must match
// the device-side definition byte-for-byte, hence the packed attribute (no
// padding between the 64-bit device addresses and the 32-bit lengths).
struct CustAicpuSoBuf {
  uint64_t kernelSoBuf;      // device address of the .so binary image
  uint32_t kernelSoBufLen;   // length of the .so image in bytes
  uint64_t kernelSoName;     // device address of the so-name string (not NUL-terminated)
  uint32_t kernelSoNameLen;  // length of the so-name string in bytes
} __attribute__((packed));
| } // namespace | } // namespace | ||||
| DumpProperties ModelManager::dump_properties_; | DumpProperties ModelManager::dump_properties_; | ||||
| @@ -163,7 +170,13 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { | |||||
| GELOGI("The session: %lu not created.", session_id); | GELOGI("The session: %lu not created.", session_id); | ||||
| return; | return; | ||||
| } else { | } else { | ||||
| GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId()))); | |||||
| rtContext_t ctx = nullptr; | |||||
| bool has_ctx = (rtCtxGetCurrent(&ctx) == RT_ERROR_NONE); | |||||
| if (!has_ctx) { | |||||
| GELOGI("Set device %u.", GetContext().DeviceId()); | |||||
| GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId()))); | |||||
| } | |||||
| Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0); | Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGW("The session: %lu destroy failed.", session_id); | GELOGW("The session: %lu destroy failed.", session_id); | ||||
| @@ -171,7 +184,11 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { | |||||
| (void)sess_ids_.erase(session_id); | (void)sess_ids_.erase(session_id); | ||||
| GELOGI("The session: %lu destroyed.", session_id); | GELOGI("The session: %lu destroyed.", session_id); | ||||
| } | } | ||||
| GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | |||||
| if (!has_ctx) { | |||||
| GELOGI("Reset device %u.", GetContext().DeviceId()); | |||||
| GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -219,6 +236,7 @@ ModelManager::~ModelManager() { | |||||
| std::lock_guard<std::mutex> lock(map_mutex_); | std::lock_guard<std::mutex> lock(map_mutex_); | ||||
| model_map_.clear(); | model_map_.clear(); | ||||
| model_aicpu_kernel_.clear(); | model_aicpu_kernel_.clear(); | ||||
| cust_aicpu_so_.clear(); | |||||
| GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); | GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); | ||||
| } | } | ||||
| @@ -919,7 +937,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model | |||||
| } | } | ||||
| davinci_model->SetDeviceId(device_id); | davinci_model->SetDeviceId(device_id); | ||||
| davinci_model->SetOmName(model.om_name); | davinci_model->SetOmName(model.om_name); | ||||
| if (DumpManager::GetInstance().IsDumpOpen()) { | |||||
| if (DumpManager::GetInstance().GetDumpProperties().IsDumpOpen()) { | |||||
| davinci_model->SetDumpProperties(DumpManager::GetInstance().GetDumpProperties()); | davinci_model->SetDumpProperties(DumpManager::GetInstance().GetDumpProperties()); | ||||
| } else { | } else { | ||||
| davinci_model->SetDumpProperties(dump_properties_); | davinci_model->SetDumpProperties(dump_properties_); | ||||
| @@ -1070,6 +1088,67 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status ModelManager::LoadCustAicpuSo(const OpDescPtr op_desc, string so_name) { | |||||
| std::lock_guard<std::mutex> lock(cust_aicpu_mutex_); | |||||
| auto it = cust_aicpu_so_.find(so_name); | |||||
| if (it == cust_aicpu_so_.end()) { | |||||
| GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, so_name), "LaunchCustAicpuSo failed. op name %s, so_name %s", | |||||
| op_desc->GetName().c_str(), so_name.c_str()); | |||||
| (void)cust_aicpu_so_.insert(so_name); | |||||
| GELOGI("LaunchCustAicpuSo op name %s, so_name %s.", op_desc->GetName().c_str(), so_name.c_str()); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| Status ModelManager::LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name) { | |||||
| CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); | |||||
| if (aicpu_kernel == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| const void *aicpu_data = aicpu_kernel->GetBinData(); | |||||
| uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize(); | |||||
| void *d_aicpu_data = nullptr; | |||||
| void *d_so_name = nullptr; | |||||
| void *args = nullptr; | |||||
| rtError_t status; | |||||
| rtStream_t stream = nullptr; | |||||
| GE_CHK_RT(rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT(rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHK_RT(rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM)); | |||||
| GE_CHK_RT(rtMemcpy(d_so_name, so_name.size(), reinterpret_cast<const void *>(so_name.c_str()), so_name.size(), | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| CustAicpuSoBuf cust_aicpu_so_buf; | |||||
| cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data)); | |||||
| cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; | |||||
| cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name)); | |||||
| cust_aicpu_so_buf.kernelSoNameLen = so_name.size(); | |||||
| uint32_t args_size = sizeof(CustAicpuSoBuf); | |||||
| GE_CHK_RT(rtMalloc(&args, args_size, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT(rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHK_RT(rtStreamCreate(&stream, 0)); | |||||
| GE_CHK_RT(rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream)); | |||||
| status = rtStreamSynchronize(stream); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); | |||||
| GE_CHK_RT(rtStreamDestroy(stream)); | |||||
| GE_CHK_RT(rtFree(args)); | |||||
| GE_CHK_RT(rtFree(d_aicpu_data)); | |||||
| GE_CHK_RT(rtFree(d_so_name)); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| GE_CHK_RT(rtStreamDestroy(stream)); | |||||
| GE_CHK_RT(rtFree(args)); | |||||
| GE_CHK_RT(rtFree(d_aicpu_data)); | |||||
| GE_CHK_RT(rtFree(d_so_name)); | |||||
| GELOGI("Cpu kernel launch loadOpFromBuf task success."); | |||||
| return SUCCESS; | |||||
| } | |||||
| /// | /// | ||||
| /// @ingroup ge | /// @ingroup ge | ||||
| /// @brief get model memory size and weight | /// @brief get model memory size and weight | ||||
| @@ -268,6 +268,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||||
| ge::Status DestroyAicpuSessionForInfer(uint32_t model_id); | ge::Status DestroyAicpuSessionForInfer(uint32_t model_id); | ||||
| ge::Status LoadCustAicpuSo(const OpDescPtr op_desc, string so_name); | |||||
| ge::Status LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name); | |||||
| ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info); | ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info); | ||||
| ge::Status GenSessionId(uint64_t &session_id); | ge::Status GenSessionId(uint64_t &session_id); | ||||
| @@ -333,6 +337,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||||
| uint64_t session_id_bias_; | uint64_t session_id_bias_; | ||||
| std::set<uint64_t> sess_ids_; | std::set<uint64_t> sess_ids_; | ||||
| std::vector<rtExceptionInfo> exception_infos_; | std::vector<rtExceptionInfo> exception_infos_; | ||||
| std::mutex cust_aicpu_mutex_; | |||||
| std::set<std::string> cust_aicpu_so_; | |||||
| static DumpProperties dump_properties_; | static DumpProperties dump_properties_; | ||||
| }; | }; | ||||
| @@ -29,6 +29,14 @@ | |||||
| #include "framework/common/debug/ge_log.h" | #include "framework/common/debug/ge_log.h" | ||||
| #include "graph/manager/graph_var_manager.h" | #include "graph/manager/graph_var_manager.h" | ||||
// Guard against addressing past a model memory section: when OFFSET does not
// fall strictly inside SIZE bytes, log and bail out of the calling function
// (which returns a container) with an empty result.
// Fix: macro parameters are now parenthesized so expression arguments
// (e.g. `input_offset - model_param.logic_var_base`) expand safely.
#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET)                                                                \
  do {                                                                                                      \
    if ((SIZE) <= static_cast<uint64_t>(OFFSET)) {                                                          \
      GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", (OP)->GetName().c_str(), (SIZE),     \
             (OFFSET));                                                                                     \
      return {};                                                                                            \
    }                                                                                                       \
  } while (0)
| namespace ge { | namespace ge { | ||||
| /// | /// | ||||
| /// @ingroup ge | /// @ingroup ge | ||||
| @@ -38,7 +46,7 @@ namespace ge { | |||||
| vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { | vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { | ||||
| vector<int64_t> v_input_size; | vector<int64_t> v_input_size; | ||||
| GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size); | GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size); | ||||
| const size_t inputs_size = op_desc->GetInputsSize(); | |||||
| const size_t inputs_size = op_desc->GetAllInputsSize(); | |||||
| const string op_type = op_desc->GetType(); | const string op_type = op_desc->GetType(); | ||||
| const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | ||||
| @@ -151,7 +159,7 @@ vector<int64_t> ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { | |||||
| } | } | ||||
| // other ops get weight from connected constop | // other ops get weight from connected constop | ||||
| const size_t inputs_size = op_desc->GetInputsSize(); | |||||
| const size_t inputs_size = op_desc->GetAllInputsSize(); | |||||
| const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | ||||
| for (size_t i = 0; i < inputs_size; ++i) { | for (size_t i = 0; i < inputs_size; ++i) { | ||||
| if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | ||||
| @@ -191,7 +199,7 @@ vector<ConstGeTensorPtr> ModelUtils::GetWeights(ConstOpDescPtr op_desc) { | |||||
| } | } | ||||
| // other ops get weight from connected constop | // other ops get weight from connected constop | ||||
| const size_t inputs_size = op_desc->GetInputsSize(); | |||||
| const size_t inputs_size = op_desc->GetAllInputsSize(); | |||||
| const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | ||||
| for (size_t i = 0; i < inputs_size; ++i) { | for (size_t i = 0; i < inputs_size; ++i) { | ||||
| if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | if ((i < v_is_input_const.size()) && v_is_input_const[i]) { | ||||
| @@ -221,7 +229,7 @@ vector<::tagCcAICPUTensor> ModelUtils::GetInputDescs(ConstOpDescPtr op_desc) { | |||||
| vector<::opTensor_t> v_input_descs; | vector<::opTensor_t> v_input_descs; | ||||
| GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_descs); | GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_descs); | ||||
| const size_t inputs_size = op_desc->GetInputsSize(); | |||||
| const size_t inputs_size = op_desc->GetAllInputsSize(); | |||||
| const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | const vector<bool> v_is_input_const = op_desc->GetIsInputConst(); | ||||
| for (size_t i = 0; i < inputs_size; ++i) { | for (size_t i = 0; i < inputs_size; ++i) { | ||||
| @@ -306,7 +314,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||||
| GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr); | GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr); | ||||
| uint64_t session_id = model_param.session_id; | uint64_t session_id = model_param.session_id; | ||||
| const size_t inputs_size = op_desc->GetInputsSize(); | |||||
| const size_t inputs_size = op_desc->GetAllInputsSize(); | |||||
| const vector<int64_t> v_input_offset = op_desc->GetInputOffset(); | const vector<int64_t> v_input_offset = op_desc->GetInputOffset(); | ||||
| const string op_type = op_desc->GetType(); | const string op_type = op_desc->GetType(); | ||||
| @@ -334,6 +342,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||||
| if (tensor_size) { | if (tensor_size) { | ||||
| int64_t data_offset = 0; | int64_t data_offset = 0; | ||||
| GE_CHK_STATUS(TensorUtils::GetDataOffset(*tensor_desc, data_offset)); | GE_CHK_STATUS(TensorUtils::GetDataOffset(*tensor_desc, data_offset)); | ||||
| VALIDATE_MEM_RANGE(op_desc, model_param.weight_size, data_offset); | |||||
| uint8_t *weight_addr = model_param.weight_base + data_offset; | uint8_t *weight_addr = model_param.weight_base + data_offset; | ||||
| v_input_data_addr.push_back(weight_addr); | v_input_data_addr.push_back(weight_addr); | ||||
| GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, | GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, | ||||
| @@ -345,11 +354,12 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||||
| GE_IF_BOOL_EXEC(non_const_index >= v_input_offset.size(), | GE_IF_BOOL_EXEC(non_const_index >= v_input_offset.size(), | ||||
| GELOGW("offsets=%zu, inputs=%zu, index=%zu.", v_input_offset.size(), inputs_size, non_const_index); | GELOGW("offsets=%zu, inputs=%zu, index=%zu.", v_input_offset.size(), inputs_size, non_const_index); | ||||
| break;); | |||||
| break); | |||||
| int64_t input_offset = v_input_offset[non_const_index]; | int64_t input_offset = v_input_offset[non_const_index]; | ||||
| non_const_index++; | non_const_index++; | ||||
| GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), | GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), | ||||
| VALIDATE_MEM_RANGE(op_desc, model_param.var_size, input_offset - model_param.logic_var_base); | |||||
| uint8_t *variable_addr = model_param.var_base + input_offset - model_param.logic_var_base; | uint8_t *variable_addr = model_param.var_base + input_offset - model_param.logic_var_base; | ||||
| v_input_data_addr.push_back(variable_addr); | v_input_data_addr.push_back(variable_addr); | ||||
| GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]", | GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]", | ||||
| @@ -363,6 +373,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||||
| mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(input_offset)); | mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(input_offset)); | ||||
| v_input_data_addr.push_back(mem_addr); | v_input_data_addr.push_back(mem_addr); | ||||
| } else { | } else { | ||||
| VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, input_offset); | |||||
| mem_addr = model_param.mem_base + input_offset; | mem_addr = model_param.mem_base + input_offset; | ||||
| v_input_data_addr.push_back(mem_addr); | v_input_data_addr.push_back(mem_addr); | ||||
| } | } | ||||
| @@ -398,6 +409,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||||
| } | } | ||||
| for (size_t i = 0; i < outputs_size; ++i) { | for (size_t i = 0; i < outputs_size; ++i) { | ||||
| GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]), | GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]), | ||||
| VALIDATE_MEM_RANGE(op_desc, model_param.var_size, v_output_offset[i] - model_param.logic_var_base); | |||||
| uint8_t *variable_addr = model_param.var_base + v_output_offset[i] - model_param.logic_var_base; | uint8_t *variable_addr = model_param.var_base + v_output_offset[i] - model_param.logic_var_base; | ||||
| v_output_data_addr.push_back(variable_addr); | v_output_data_addr.push_back(variable_addr); | ||||
| GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]", | GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]", | ||||
| @@ -410,6 +422,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||||
| mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_output_offset[i])); | mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_output_offset[i])); | ||||
| v_output_data_addr.push_back(mem_addr); | v_output_data_addr.push_back(mem_addr); | ||||
| } else { | } else { | ||||
| VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_output_offset[i]); | |||||
| mem_addr = static_cast<uint8_t *>(model_param.mem_base + v_output_offset[i]); | mem_addr = static_cast<uint8_t *>(model_param.mem_base + v_output_offset[i]); | ||||
| v_output_data_addr.push_back(mem_addr); | v_output_data_addr.push_back(mem_addr); | ||||
| } | } | ||||
| @@ -440,15 +453,19 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param | |||||
| for (size_t i = 0; i < v_workspace_bytes.size(); ++i) { | for (size_t i = 0; i < v_workspace_bytes.size(); ++i) { | ||||
| if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { | if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { | ||||
| v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i]))); | v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i]))); | ||||
| GELOGI("Fusion: op: %s, GetWorkspaceDataAddrs mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i, | |||||
| reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i]))); | |||||
| GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[L1] name[%s], mem_addr[workspace index %zu]:0x%lx", | |||||
| model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i]); | |||||
| } else if (v_workspace_bytes[i] == 0) { | |||||
| v_workspace_data_addr.push_back(nullptr); | |||||
| GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] Null addr", | |||||
| model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i]); | |||||
| } else { | } else { | ||||
| int64_t workspace_offset = v_workspace_offset[i]; | |||||
| int64_t workspace_bytes = v_workspace_bytes[i]; | |||||
| uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : model_param.mem_base + workspace_offset; | |||||
| VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_workspace_offset[i]); | |||||
| uint8_t *mem_addr = model_param.mem_base + v_workspace_offset[i]; | |||||
| v_workspace_data_addr.push_back(mem_addr); | v_workspace_data_addr.push_back(mem_addr); | ||||
| GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]", | GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]", | ||||
| model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr); | |||||
| model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i], | |||||
| mem_addr); | |||||
| } | } | ||||
| } | } | ||||
| @@ -26,6 +26,7 @@ | |||||
| #include "framework/common/l2_cache_optimize.h" | #include "framework/common/l2_cache_optimize.h" | ||||
| #include "graph/debug/ge_attr_define.h" | #include "graph/debug/ge_attr_define.h" | ||||
| #include "graph/load/new_model_manager/davinci_model.h" | #include "graph/load/new_model_manager/davinci_model.h" | ||||
| #include "graph/load/new_model_manager/model_manager.h" | |||||
| #include "graph/load/new_model_manager/model_utils.h" | #include "graph/load/new_model_manager/model_utils.h" | ||||
| #include "runtime/kernel.h" | #include "runtime/kernel.h" | ||||
| #include "super_kernel/super_kernel.h" | #include "super_kernel/super_kernel.h" | ||||
| @@ -41,13 +42,6 @@ const char *kIsLastNode = "is_last_node"; | |||||
| const char *kIsFirstNode = "is_first_node"; | const char *kIsFirstNode = "is_first_node"; | ||||
| const int64_t kCloseSkt = 100; | const int64_t kCloseSkt = 100; | ||||
| const uint32_t kAddrLen = sizeof(void *); | const uint32_t kAddrLen = sizeof(void *); | ||||
| const char *const kLoadOpFromBuf = "loadOpFromBuf"; | |||||
| struct CustAicpuSoBuf { | |||||
| uint64_t kernelSoBuf; | |||||
| uint32_t kernelSoBufLen; | |||||
| uint64_t kernelSoName; | |||||
| uint32_t kernelSoNameLen; | |||||
| } __attribute__((packed)); | |||||
| } // namespace | } // namespace | ||||
| namespace ge { | namespace ge { | ||||
| @@ -861,92 +855,6 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status KernelTaskInfo::LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def) { | |||||
| CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); | |||||
| if (aicpu_kernel == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| const void *aicpu_data = aicpu_kernel->GetBinData(); | |||||
| uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize(); | |||||
| void *d_aicpu_data = nullptr; | |||||
| rtError_t status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| void *d_so_name = nullptr; | |||||
| status = rtMalloc(&d_so_name, so_name_.size(), RT_MEMORY_HBM); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| status = rtMemcpy(d_so_name, so_name_.size(), reinterpret_cast<const void *>(so_name_.c_str()), so_name_.size(), | |||||
| RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| CustAicpuSoBuf cust_aicpu_so_buf; | |||||
| cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data)); | |||||
| cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; | |||||
| cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name)); | |||||
| cust_aicpu_so_buf.kernelSoNameLen = so_name_.size(); | |||||
| void *args = nullptr; | |||||
| uint32_t args_size = sizeof(CustAicpuSoBuf); | |||||
| status = rtMalloc(&args, args_size, RT_MEMORY_HBM); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| GELOGI("loadOpFromBuf kernelSoBuf %p, kernelSoBufLen %u, kernelSoName %p, kernelSoNameLen %u.", d_aicpu_data, | |||||
| aicpu_data_length, d_so_name, so_name_.size()); | |||||
| status = rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| rtStream_t stream = nullptr; | |||||
| status = rtStreamCreate(&stream, 0); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt create stream failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| status = rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt CpuKernelLaunch loadOpFromBuf failed, status: 0x%X", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| GELOGI("Cpu kernel launch loadOpFromBuf."); | |||||
| status = rtStreamSynchronize(stream); | |||||
| if (status != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); | |||||
| return RT_ERROR_TO_GE_STATUS(status); | |||||
| } | |||||
| GE_CHK_RT(rtFree(args)); | |||||
| GE_CHK_RT(rtFree(d_aicpu_data)); | |||||
| GE_CHK_RT(rtFree(d_so_name)); | |||||
| GELOGI("Cpu kernel launch loadOpFromBuf task success."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) { | Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) { | ||||
| GELOGI("Do InitAicpuTask"); | GELOGI("Do InitAicpuTask"); | ||||
| so_name_ = kernel_def.so_name(); | so_name_ = kernel_def.so_name(); | ||||
| @@ -961,7 +869,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k | |||||
| } | } | ||||
| if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { | if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { | ||||
| GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, kernel_def), "launch cust aicpu so failed"); | |||||
| GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_), "launch cust aicpu so failed"); | |||||
| } | } | ||||
| // copy args to new host memory | // copy args to new host memory | ||||
| @@ -106,8 +106,6 @@ class KernelTaskInfo : public TaskInfo { | |||||
| Status InitAicpuTaskExtInfo(const std::string &ext_info); | Status InitAicpuTaskExtInfo(const std::string &ext_info); | ||||
| Status LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def); | |||||
| Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs, | Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs, | ||||
| const std::vector<void *> &output_data_addrs, | const std::vector<void *> &output_data_addrs, | ||||
| const std::vector<::tagCcAICPUTensor> &input_descs, | const std::vector<::tagCcAICPUTensor> &input_descs, | ||||
| @@ -118,13 +118,11 @@ bool ZeroCopyTask::CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_ad | |||||
| */ | */ | ||||
| Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs, | Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs, | ||||
| const string &batch_label) { | const string &batch_label) { | ||||
| for (auto pair : task_addr_offset_) { | |||||
| if (pair.first != addr) { | |||||
| continue; | |||||
| } | |||||
| auto iter = task_addr_offset_.find(addr); | |||||
| if (iter != task_addr_offset_.end()) { | |||||
| auto &cur_pair = *iter; | |||||
| uint8_t *args_info = args_info_.data(); | uint8_t *args_info = args_info_.data(); | ||||
| for (auto offset : pair.second) { | |||||
| for (auto offset : cur_pair.second) { | |||||
| if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast<uintptr_t>(args_addr_ + offset))) { | if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast<uintptr_t>(args_addr_ + offset))) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -83,6 +83,10 @@ class ZeroCopyTask { | |||||
| */ | */ | ||||
| ge::Status DistributeParam(bool async_mode, rtStream_t stream); | ge::Status DistributeParam(bool async_mode, rtStream_t stream); | ||||
| void SetBatchLabel(const string &batch_label) { batch_label_ = batch_label; } | |||||
| const string &GetBatchLabel() const { return batch_label_; } | |||||
| protected: | protected: | ||||
| bool CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label, uintptr_t addr); | bool CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label, uintptr_t addr); | ||||
| @@ -93,7 +97,7 @@ class ZeroCopyTask { | |||||
| const size_t args_size_; | const size_t args_size_; | ||||
| vector<uint8_t> args_info_; | vector<uint8_t> args_info_; | ||||
| bool is_updated_; | bool is_updated_; | ||||
| string batch_label_; | |||||
| // <address from Op, {offset in args}> | // <address from Op, {offset in args}> | ||||
| map<uintptr_t, vector<size_t>> task_addr_offset_; | map<uintptr_t, vector<size_t>> task_addr_offset_; | ||||
| }; | }; | ||||
| @@ -267,6 +267,14 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, | |||||
| auto compute_graph = GraphUtils::GetComputeGraph(graph); | auto compute_graph = GraphUtils::GetComputeGraph(graph); | ||||
| if (compute_graph != nullptr) { | if (compute_graph != nullptr) { | ||||
| compute_graph->SetGraphID(graph_id); | compute_graph->SetGraphID(graph_id); | ||||
| bool graph_has_been_added = false; | |||||
| if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) && | |||||
| graph_has_been_added) { | |||||
| GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] same graph object can not be added again, graph_id = %u.", | |||||
| graph_id); | |||||
| return GE_GRAPH_GRAPH_ALREADY_EXIST; | |||||
| } | |||||
| (void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true); | |||||
| } else { | } else { | ||||
| GELOGE(FAILED, "compute graph is null"); | GELOGE(FAILED, "compute graph is null"); | ||||
| return FAILED; | return FAILED; | ||||
| @@ -1953,9 +1961,9 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||||
| names_to_passes.emplace_back("MergePass", &merge_pass); | names_to_passes.emplace_back("MergePass", &merge_pass); | ||||
| names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); | names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); | ||||
| names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); | names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); | ||||
| names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); | |||||
| names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); | names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); | ||||
| names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass); | names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass); | ||||
| names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); | |||||
| names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); | names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); | ||||
| names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); | names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); | ||||
| names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass); | names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass); | ||||
| @@ -23,6 +23,7 @@ | |||||
| #include <mutex> | #include <mutex> | ||||
| #include "common/op/ge_op_utils.h" | #include "common/op/ge_op_utils.h" | ||||
| #include "common/util/error_manager/error_manager.h" | |||||
| #include "graph/utils/graph_utils.h" | #include "graph/utils/graph_utils.h" | ||||
| #include "graph/utils/op_desc_utils.h" | #include "graph/utils/op_desc_utils.h" | ||||
| #include "init/gelib.h" | #include "init/gelib.h" | ||||
| @@ -82,6 +83,8 @@ Status EnginePlacer::Run() { | |||||
| // If can't get op's engine name, keep check support finish and return failed | // If can't get op's engine name, keep check support finish and return failed | ||||
| if (engine_name.empty()) { | if (engine_name.empty()) { | ||||
| is_check_support_success = false; | is_check_support_success = false; | ||||
| ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"}, | |||||
| {op_desc->GetName(), op_desc->GetType()}); | |||||
| GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s", | GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s", | ||||
| node_ptr->GetOpDesc()->GetType().c_str()); | node_ptr->GetOpDesc()->GetType().c_str()); | ||||
| continue; | continue; | ||||
| @@ -190,6 +190,10 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vector<OutDataAnc | |||||
| GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index); | GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index); | ||||
| return FAILED; | return FAILED; | ||||
| } | } | ||||
| GE_IF_BOOL_EXEC( | |||||
| in_data_anchor->GetPeerOutAnchor() == nullptr, | |||||
| GELOGW("Get null input by index %d from node %s ", in_data_anchor->GetIdx(), node->GetName().c_str()); | |||||
| continue); | |||||
| data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor()); | data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor()); | ||||
| } | } | ||||
| @@ -239,7 +239,7 @@ Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, N | |||||
| GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32); | GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32); | ||||
| GeTensor tensor(const_tensor); | GeTensor tensor(const_tensor); | ||||
| tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t)); | |||||
| (void)tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t)); | |||||
| if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) { | if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) { | ||||
| GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str()); | GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str()); | ||||
| return FAILED; | return FAILED; | ||||
| @@ -50,9 +50,12 @@ Status InsertReshapeIfNeed(const NodePtr &node) { | |||||
| GE_CHECK_NOTNULL(src_tensor); | GE_CHECK_NOTNULL(src_tensor); | ||||
| for (auto dst_anchor : src_anchor->GetPeerInDataAnchors()) { | for (auto dst_anchor : src_anchor->GetPeerInDataAnchors()) { | ||||
| auto dst_node = dst_anchor->GetOwnerNode(); | auto dst_node = dst_anchor->GetOwnerNode(); | ||||
| GELOGD("Try insert reshape between %s[%d] and %s[%d] to keep the shape continues", node->GetName().c_str(), | |||||
| src_anchor->GetIdx(), dst_node->GetName().c_str(), dst_anchor->GetIdx()); | |||||
| GE_CHECK_NOTNULL(dst_node); | GE_CHECK_NOTNULL(dst_node); | ||||
| GE_CHECK_NOTNULL(dst_node->GetOpDesc()); | GE_CHECK_NOTNULL(dst_node->GetOpDesc()); | ||||
| auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx()); | auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx()); | ||||
| GE_CHECK_NOTNULL(dst_tensor); | |||||
| bool is_need_insert_reshape = src_tensor->GetShape().GetDims() != UNKNOWN_RANK && | bool is_need_insert_reshape = src_tensor->GetShape().GetDims() != UNKNOWN_RANK && | ||||
| dst_tensor->GetShape().GetDims() != UNKNOWN_RANK && | dst_tensor->GetShape().GetDims() != UNKNOWN_RANK && | ||||
| src_tensor->GetShape().GetDims() != dst_tensor->GetShape().GetDims(); | src_tensor->GetShape().GetDims() != dst_tensor->GetShape().GetDims(); | ||||
| @@ -113,10 +113,9 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { | |||||
| desc->CopyAttrsFrom(*src_op_desc); | desc->CopyAttrsFrom(*src_op_desc); | ||||
| for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) { | for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) { | ||||
| auto input_desc = desc->MutableInputDesc(i); | auto input_desc = desc->MutableInputDesc(i); | ||||
| GE_IF_BOOL_EXEC(input_desc == nullptr, | |||||
| GELOGE(INTERNAL_ERROR, "Failed to get input desc by index %u from node %s when copy from %s", i, | |||||
| desc->GetName().c_str(), node->GetName().c_str()); | |||||
| return nullptr); | |||||
| GE_IF_BOOL_EXEC(input_desc == nullptr, GELOGW("Get null input desc by index %u from node %s when copy from %s", i, | |||||
| desc->GetName().c_str(), node->GetName().c_str()); | |||||
| continue); | |||||
| input_desc->CopyAttrsFrom(src_op_desc->GetInputDesc(i)); | input_desc->CopyAttrsFrom(src_op_desc->GetInputDesc(i)); | ||||
| } | } | ||||
| @@ -991,12 +990,17 @@ Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() { | |||||
| size_t i = 0; | size_t i = 0; | ||||
| for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { | for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { | ||||
| for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { | for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { | ||||
| auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY); | |||||
| GE_CHECK_NOTNULL(identity_desc); | |||||
| auto out_node = in_data_anchor->GetOwnerNode(); | auto out_node = in_data_anchor->GetOwnerNode(); | ||||
| auto op_desc = out_node->GetOpDesc(); | auto op_desc = out_node->GetOpDesc(); | ||||
| GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
| if ((out_node->GetType() == MERGE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) { | |||||
| GELOGD("No need to insert identity between %s and %s.", node->GetName().c_str(), out_node->GetName().c_str()); | |||||
| continue; | |||||
| } | |||||
| auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY); | |||||
| GE_CHECK_NOTNULL(identity_desc); | |||||
| string batch_label; | string batch_label; | ||||
| if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { | if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { | ||||
| if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { | if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { | ||||
| @@ -16,131 +16,262 @@ | |||||
| #include "host_kernels/strided_slice_kernel.h" | #include "host_kernels/strided_slice_kernel.h" | ||||
| #include <memory> | |||||
| #include "common/fp16_t.h" | #include "common/fp16_t.h" | ||||
| #include "common/ge_inner_error_codes.h" | #include "common/ge_inner_error_codes.h" | ||||
| #include "common/math/math_util.h" | #include "common/math/math_util.h" | ||||
| #include "common/op/ge_op_utils.h" | #include "common/op/ge_op_utils.h" | ||||
| #include "external/graph/types.h" | |||||
| #include "framework/common/debug/ge_log.h" | #include "framework/common/debug/ge_log.h" | ||||
| #include "host_kernels/kernel_utils.h" | |||||
| #include "graph/utils/type_utils.h" | #include "graph/utils/type_utils.h" | ||||
| #include "host_kernels/kernel_utils.h" | |||||
| #include "inc/kernel_factory.h" | #include "inc/kernel_factory.h" | ||||
| #include <memory> | |||||
| namespace ge { | namespace ge { | ||||
| namespace { | namespace { | ||||
| const int32_t kNumOne = 1; | const int32_t kNumOne = 1; | ||||
| const size_t kStridedSliceInputSize = 4; | const size_t kStridedSliceInputSize = 4; | ||||
| const size_t kStridedSliceInputIndex0 = 0; | |||||
| const size_t kStridedSliceInputIndex1 = 1; | |||||
| const size_t kStridedSliceInputIndex2 = 2; | |||||
| const size_t kStridedSliceInputIndex3 = 3; | |||||
| const int32_t kDefaultSrideSize = 1; | |||||
| } // namespace | |||||
| Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input, | |||||
| Attr &args) { | |||||
| int64_t begin_mask = 0; | |||||
| int64_t end_mask = 0; | |||||
| int64_t ellipsis_mask = 0; | |||||
| int64_t new_axis_mask = 0; | |||||
| int64_t shrink_axis_mask = 0; | |||||
| const size_t kStridedSliceInputIndex = 0; | |||||
| const size_t kStridedSliceBeginIndex = 1; | |||||
| const size_t kStridedSliceEndIndex = 2; | |||||
| const size_t kStridedSliceStrideIndex = 3; | |||||
| const int32_t kDefaultStrideSize = 1; | |||||
| const std::set<DataType> kIndexNumberType = {DT_INT32, DT_INT64}; | |||||
| if (attr == nullptr) { | |||||
| GELOGW("input opdescptr is nullptr."); | |||||
| return PARAM_INVALID; | |||||
| bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const int ellipsis_mask) { | |||||
| if (ellipsis_mask != 0) { | |||||
| auto ellipsis_num = 0; | |||||
| auto input_shape = input_desc->GetShape(); | |||||
| bool ellipsis_mask_flag = false; | |||||
| for (size_t i = 0; i < input_shape.GetDimNum(); i++) { | |||||
| uint32_t i_temp = static_cast<uint32_t>(i); | |||||
| ellipsis_mask_flag = (static_cast<uint32_t>(ellipsis_mask) & (1 << i_temp)); | |||||
| if (ellipsis_mask_flag) { | |||||
| ++ellipsis_num; | |||||
| } | |||||
| if (ellipsis_num > 1) { | |||||
| GELOGW("Only one non-zero bit is allowed in ellipsis_mask."); | |||||
| return false; | |||||
| } | |||||
| } | |||||
| } | } | ||||
| if (input.size() != kStridedSliceInputSize) { | |||||
| GELOGW("The number of input for strided slice must be %zu.", kStridedSliceInputSize); | |||||
| return PARAM_INVALID; | |||||
| return true; | |||||
| } | |||||
| } // namespace | |||||
| Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input, | |||||
| vector<ge::GeTensorPtr> &v_output) { | |||||
| GELOGD("StridedSliceKernel in."); | |||||
| // 1.Check input and attrs | |||||
| if (CheckAndGetAttr(attr) != SUCCESS) { | |||||
| GELOGW("Check and get attrs failed.Ignore kernel."); | |||||
| return NOT_CHANGED; | |||||
| } | } | ||||
| if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_BEGIN_MASK, begin_mask)) { | |||||
| GELOGW("get begin_mask attr failed."); | |||||
| return PARAM_INVALID; | |||||
| if (CheckInputParam(input) != SUCCESS) { | |||||
| GELOGW("Check input params failed.Ignore kernel."); | |||||
| return NOT_CHANGED; | |||||
| } | } | ||||
| if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_END_MASK, end_mask)) { | |||||
| GELOGW("get end_mask attr failed."); | |||||
| return PARAM_INVALID; | |||||
| // 2.Init param with mask attrs. | |||||
| std::vector<int64_t> input_dims; | |||||
| std::vector<int64_t> begin_vec; | |||||
| std::vector<int64_t> output_dims; | |||||
| std::vector<int64_t> stride_vec; | |||||
| if (InitParamWithAttrs(input, input_dims, begin_vec, output_dims, stride_vec) != SUCCESS) { | |||||
| GELOGW("Init param with mask attrs failed.Ignore kernel."); | |||||
| return NOT_CHANGED; | |||||
| } | } | ||||
| if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_ELLIPSIS_MASK, ellipsis_mask)) { | |||||
| GELOGW("get ellipsis_mask attr failed."); | |||||
| return PARAM_INVALID; | |||||
| // 3.Set sliced data to output_ptr | |||||
| ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex]; | |||||
| auto data_type = weight0->GetTensorDesc().GetDataType(); | |||||
| size_t data_size = weight0->GetData().size() / GetSizeByDataType(data_type); | |||||
| void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data())); | |||||
| GE_CHECK_NOTNULL(data); | |||||
| // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. | |||||
| auto output_tensor_desc = attr->GetOutputDesc(0); | |||||
| GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc); | |||||
| if (output_ptr == nullptr) { | |||||
| GELOGE(MEMALLOC_FAILED, "MakeShared GeTensor failed, node name %s.", attr->GetName().c_str()); | |||||
| return NOT_CHANGED; | |||||
| } | } | ||||
| if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_NEW_AXIS_MASK, new_axis_mask)) { | |||||
| GELOGW("get new_axis_mask attr failed."); | |||||
| return PARAM_INVALID; | |||||
| auto ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), data_type, input_dims, begin_vec, | |||||
| output_dims, output_ptr.get(), stride_vec); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "SetOutputSliceData failed."); | |||||
| return NOT_CHANGED; | |||||
| } | } | ||||
| if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, shrink_axis_mask)) { | |||||
| GELOGW("get shrink_axis_mask attr failed."); | |||||
| // 4.Set output data_type and shape | |||||
| GeTensorDesc &t_d = output_ptr->MutableTensorDesc(); | |||||
| t_d.SetDataType(static_cast<DataType>(data_type)); | |||||
| auto final_dim_size = static_cast<uint32_t>(output_dims.size()); | |||||
| vector<int64_t> v_dims; | |||||
| GetOutputDims(final_dim_size, output_dims, v_dims); | |||||
| t_d.SetShape(GeShape(v_dims)); | |||||
| v_output.push_back(output_ptr); | |||||
| GELOGI("StridedSliceKernel success."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr) { | |||||
| if (attr == nullptr) { | |||||
| GELOGE(PARAM_INVALID, "input opdescptr is nullptr."); | |||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| } | } | ||||
| if ((ellipsis_mask != 0) || (new_axis_mask != 0)) { | |||||
| GELOGW("ellipsis_mask or new_axis_mask must be 0 with optimizer."); | |||||
| return NOT_CHANGED; | |||||
| // Get all op attr value of strided_slice | |||||
| for (auto &attr_2_value : attr_value_map_) { | |||||
| if (!AttrUtils::GetInt(attr, attr_2_value.first, attr_2_value.second)) { | |||||
| GELOGE(PARAM_INVALID, "Get %s attr failed.", attr_2_value.first.c_str()); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| } | } | ||||
| const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex0); | |||||
| // Check ellipsis_mask is valid | |||||
| const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex); | |||||
| GE_CHECK_NOTNULL(input_desc); | GE_CHECK_NOTNULL(input_desc); | ||||
| DataType data_type = input_desc->GetDataType(); | |||||
| if ((data_type != DT_FLOAT) && (data_type != DT_INT32)) { | |||||
| GELOGW( | |||||
| "Data type of StridedSlice OP must be float or int32." | |||||
| "Constant folding will not be carried out in this condition" | |||||
| "which might affect the time performance but not the accuracy"); | |||||
| } | |||||
| args.begin_mask = begin_mask; | |||||
| args.end_mask = end_mask; | |||||
| args.ellipsis_mask = ellipsis_mask; | |||||
| args.new_axis_mask = new_axis_mask; | |||||
| args.data_type = static_cast<int64_t>(data_type); | |||||
| args.shrink_axis_mask = shrink_axis_mask; | |||||
| ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0]; | |||||
| ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1]; | |||||
| ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2]; | |||||
| ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3]; | |||||
| if (CheckWeight(weight0, weight1, weight2, weight3) != SUCCESS) { | |||||
| GELOGW("Check And Get Attr failed."); | |||||
| auto ellipsis_mask = attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK); | |||||
| if (!IsEllipsisMaskValid(input_desc, ellipsis_mask)) { | |||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status StridedSliceKernel::CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, | |||||
| const ConstGeTensorPtr &weight2, const ConstGeTensorPtr &weight3) const { | |||||
| if ((weight0 == nullptr) || (weight1 == nullptr) || (weight2 == nullptr) || (weight3 == nullptr)) { | |||||
| GELOGW("weight is nullptr."); | |||||
| Status StridedSliceKernel::CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const { | |||||
| if (input.size() != kStridedSliceInputSize) { | |||||
| GELOGE(PARAM_INVALID, "The number of input for strided slice must be %zu.", kStridedSliceInputSize); | |||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| } | } | ||||
| if (!(weight1->GetTensorDesc().GetDataType() == DT_INT32 && weight2->GetTensorDesc().GetDataType() == DT_INT32 && | |||||
| weight3->GetTensorDesc().GetDataType() == DT_INT32)) { | |||||
| GELOGE(INTERNAL_ERROR, "Data type of StridedSlice OP(begin,end,strides) must be int32."); | |||||
| return INTERNAL_ERROR; | |||||
| ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex]; | |||||
| ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex]; | |||||
| ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex]; | |||||
| ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex]; | |||||
| GE_CHECK_NOTNULL(weight0); | |||||
| GE_CHECK_NOTNULL(begin_tensor); | |||||
| GE_CHECK_NOTNULL(end_tensor); | |||||
| GE_CHECK_NOTNULL(stride_tensor); | |||||
| // check if begin,end,strides data type is supported | |||||
| auto begin_tensor_desc = begin_tensor->GetTensorDesc(); | |||||
| auto end_tensor_desc = begin_tensor->GetTensorDesc(); | |||||
| auto stride_tensor_desc = begin_tensor->GetTensorDesc(); | |||||
| if (begin_tensor_desc.GetDataType() != end_tensor_desc.GetDataType() || | |||||
| end_tensor_desc.GetDataType() != stride_tensor_desc.GetDataType()) { | |||||
| GELOGW("Data type of StridedSlice OP(begin,end,strides) must be same."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| if (kIndexNumberType.find(begin_tensor_desc.GetDataType()) == kIndexNumberType.end()) { | |||||
| GELOGW("Data type of StridedSlice OP(begin,end,strides) must be int32 or int64."); | |||||
| return PARAM_INVALID; | |||||
| } | } | ||||
| // check data | // check data | ||||
| size_t weight0_size = weight0->GetData().size() / sizeof(int32_t); | |||||
| size_t weight1_size = weight1->GetData().size() / sizeof(int32_t); | |||||
| size_t weight2_size = weight2->GetData().size() / sizeof(int32_t); | |||||
| size_t weight3_size = weight3->GetData().size() / sizeof(int32_t); | |||||
| if ((weight0_size == 0) || (weight1_size == 0) || (weight2_size == 0) || (weight3_size == 0)) { | |||||
| auto x_data_type = weight0->GetTensorDesc().GetDataType(); | |||||
| auto x_data_size = GetSizeByDataType(x_data_type); | |||||
| if (x_data_size < 0) { | |||||
| GELOGW("Data type of x input %s is not supported.", TypeUtils::DataTypeToSerialString(x_data_type).c_str()); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| size_t weight0_size = weight0->GetData().size() / x_data_size; | |||||
| size_t begin_data_size = begin_tensor->GetData().size() / sizeof(int32_t); | |||||
| size_t end_data_size = end_tensor->GetData().size() / sizeof(int32_t); | |||||
| size_t stride_data_size = stride_tensor->GetData().size() / sizeof(int32_t); | |||||
| if ((weight0_size == 0) || (begin_data_size == 0) || (end_data_size == 0) || (stride_data_size == 0)) { | |||||
| GELOGW("Data size of inputs is 0."); | GELOGW("Data size of inputs is 0."); | ||||
| return PARAM_INVALID; | return PARAM_INVALID; | ||||
| } | } | ||||
| // check dim size | // check dim size | ||||
| size_t weight0_dim_size = weight0->GetTensorDesc().GetShape().GetDimNum(); | |||||
| if (!((weight0_dim_size >= weight1_size) && (weight1_size == weight2_size) && (weight1_size == weight3_size))) { | |||||
| if (!((begin_data_size == end_data_size) && (end_data_size == stride_data_size))) { | |||||
| GELOGW("The sizes of begin, end and stride is not supported."); | GELOGW("The sizes of begin, end and stride is not supported."); | ||||
| return NOT_CHANGED; | |||||
| return PARAM_INVALID; | |||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, | |||||
| int32_t &begin_i, int32_t &end_i, int32_t &dim_i) const { | |||||
| Status StridedSliceKernel::InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input, | |||||
| std::vector<int64_t> &input_dims, std::vector<int64_t> &begin_vec, | |||||
| std::vector<int64_t> &output_dims, std::vector<int64_t> &stride_vec) { | |||||
| ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex]; | |||||
| ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex]; | |||||
| ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex]; | |||||
| ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex]; | |||||
| const GeShape x_shape = weight0->GetTensorDesc().GetShape(); | |||||
| auto x_dims = x_shape.GetDims(); | |||||
| auto x_dims_num = x_shape.GetDimNum(); | |||||
| // handle new_axis_mask | |||||
| ExpandDimsWithNewAxis(begin_tensor, x_dims_num, x_dims); | |||||
| const int32_t *begin = reinterpret_cast<const int32_t *>(begin_tensor->GetData().data()); | |||||
| const int32_t *end = reinterpret_cast<const int32_t *>(end_tensor->GetData().data()); | |||||
| const int32_t *stride = reinterpret_cast<const int32_t *>(stride_tensor->GetData().data()); | |||||
| auto begin_dim_num = begin_tensor->GetData().size() / sizeof(int32_t); | |||||
| auto min_dim = x_dims_num > begin_dim_num ? begin_dim_num : x_dims_num; | |||||
| for (size_t i = 0; i < x_dims.size(); ++i) { | |||||
| auto i_temp = static_cast<uint64_t>(i); | |||||
| bool new_axis_mask_flag = | |||||
| (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp)); | |||||
| if (new_axis_mask_flag) { | |||||
| output_dims.push_back(1); | |||||
| input_dims.push_back(1); | |||||
| begin_vec.push_back(0); | |||||
| stride_vec.push_back(1); | |||||
| continue; | |||||
| } | |||||
| int64_t begin_i = 0; | |||||
| int64_t end_i = 0; | |||||
| int64_t stride_i = 1; | |||||
| if (i < min_dim) { | |||||
| begin_i = begin[i]; | |||||
| end_i = end[i]; | |||||
| stride_i = stride[i]; | |||||
| } else { | |||||
| begin_i = 0; | |||||
| end_i = x_dims.at(i); | |||||
| stride_i = 1; | |||||
| } | |||||
| GELOGD("Before mask calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i, | |||||
| stride_i, x_dims.at(i)); | |||||
| auto ret = MaskCal(i, begin_i, end_i, x_dims.at(i)); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGW("MaskCal failed, because of data overflow."); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| int64_t dim_final; | |||||
| GELOGD("Before stride calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i, | |||||
| stride_i, x_dims.at(i)); | |||||
| (void)StrideCal(x_dims.at(i), begin_i, end_i, stride_i, dim_final); | |||||
| output_dims.push_back(dim_final); | |||||
| input_dims.push_back(x_dims.at(i)); | |||||
| begin_vec.push_back(begin_i); | |||||
| stride_vec.push_back(stride_i); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num, | |||||
| vector<int64_t> &x_dims) { | |||||
| auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType()); | |||||
| size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size; | |||||
| auto final_dim_num = x_dims_num < begin_vec_size ? begin_vec_size : x_dims_num; | |||||
| for (size_t i = 0; i < final_dim_num; i++) { | |||||
| auto i_temp = static_cast<uint64_t>(i); | |||||
| bool new_axis_mask_flag = | |||||
| (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp)); | |||||
| if (new_axis_mask_flag) { | |||||
| x_dims.insert(x_dims.begin() + i, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| Status StridedSliceKernel::MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const { | |||||
| uint64_t i_temp = static_cast<uint64_t>(i); | |||||
| bool begin_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_BEGIN_MASK)) & (1 << i_temp)); | |||||
| bool end_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK)) & (1 << i_temp)); | |||||
| bool ellipsis_mask_flag = | |||||
| (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK)) & (1 << i_temp)); | |||||
| bool shrink_mask_flag = | |||||
| (static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << i_temp)); | |||||
| if (shrink_mask_flag) { | if (shrink_mask_flag) { | ||||
| begin_i = (begin_i < 0 ? (dim_i + begin_i) : begin_i); | begin_i = (begin_i < 0 ? (dim_i + begin_i) : begin_i); | ||||
| FMK_INT32_ADDCHECK(begin_i, kNumOne); | |||||
| FMK_INT32_ADDCHECK(begin_i, kNumOne) | |||||
| end_i = begin_i + kNumOne; | end_i = begin_i + kNumOne; | ||||
| } else { | } else { | ||||
| if (begin_mask_flag) { | if (begin_mask_flag) { | ||||
| @@ -153,130 +284,43 @@ Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_ | |||||
| } else { | } else { | ||||
| end_i = (end_i < 0 ? (dim_i + end_i) : end_i); | end_i = (end_i < 0 ? (dim_i + end_i) : end_i); | ||||
| } | } | ||||
| if (ellipsis_mask_flag) { | |||||
| begin_i = 0; | |||||
| end_i = dim_i; | |||||
| } | |||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status StridedSliceKernel::StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i, | |||||
| int64_t &dim_final) const { | |||||
| if (stride_i == 0) { | |||||
| stride_i = kDefaultStrideSize; | |||||
| } else if (stride_i < 0) { | |||||
| stride_i = -stride_i; | |||||
| begin_i = x_dims_i - begin_i - 1; | |||||
| end_i = x_dims_i - end_i - 1; | |||||
| } | |||||
| void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, const Attr &args, | |||||
| if (end_i > x_dims_i) { | |||||
| end_i = x_dims_i; | |||||
| } | |||||
| if ((begin_i == 0) && (end_i == 0)) { | |||||
| dim_final = x_dims_i; | |||||
| } else { | |||||
| dim_final = abs(end_i - begin_i) / stride_i; | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, | |||||
| vector<int64_t> &v_dims) { | vector<int64_t> &v_dims) { | ||||
| for (uint32_t k = 0; k < dims_size; k++) { | for (uint32_t k = 0; k < dims_size; k++) { | ||||
| bool shrink_mask_i = (static_cast<uint32_t>(args.shrink_axis_mask) & (1 << k)); | |||||
| bool shrink_mask_i = (static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << k)); | |||||
| if (shrink_mask_i) { | if (shrink_mask_i) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| v_dims.push_back(output_dims[k]); | v_dims.push_back(output_dims[k]); | ||||
| } | } | ||||
| } | } | ||||
| Status StridedSliceKernel::CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr) { | |||||
| // check dim not all less than 0 | |||||
| for (auto dim : output_dims) { | |||||
| if (dim > 0) { | |||||
| return SUCCESS; | |||||
| } | |||||
| } | |||||
| GELOGW("all output dim <=0, can't be processed. op_name : %s", attr->GetName().c_str()); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input, | |||||
| vector<ge::GeTensorPtr> &v_output) { | |||||
| GELOGI("StridedSliceKernel in."); | |||||
| Attr args; | |||||
| Status ret = CheckAndGetAttr(attr, input, args); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGW("Check And Get Attr failed."); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0]; | |||||
| ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1]; | |||||
| ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2]; | |||||
| ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3]; | |||||
| const GeShape x_shape = weight0->GetTensorDesc().GetShape(); | |||||
| size_t dim_size = x_shape.GetDimNum(); | |||||
| size_t data_size = weight0->GetData().size() / sizeof(int32_t); | |||||
| const int32_t *begin = reinterpret_cast<const int32_t *>(weight1->GetData().data()); | |||||
| const int32_t *end = reinterpret_cast<const int32_t *>(weight2->GetData().data()); | |||||
| const int32_t *stride = reinterpret_cast<const int32_t *>(weight3->GetData().data()); | |||||
| if ((begin == nullptr) || (end == nullptr) || (stride == nullptr)) { | |||||
| GELOGW("input weight tensor is nullptr."); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| std::vector<int64_t> input_dims; | |||||
| std::vector<int64_t> begin_vec; | |||||
| std::vector<int64_t> output_dims; | |||||
| std::vector<int64_t> stride_vec; | |||||
| int64_t dim_final; | |||||
| for (size_t i = 0; i < dim_size; i++) { | |||||
| int32_t begin_i = begin[i]; | |||||
| int32_t end_i = end[i]; | |||||
| int32_t stride_i = stride[i]; | |||||
| int32_t dim_i = static_cast<int32_t>(x_shape.GetDim(i)); | |||||
| GELOGI("%d\t %d\t %d\t %d", begin_i, end_i, stride_i, dim_i); | |||||
| uint32_t i_temp = static_cast<uint32_t>(i); | |||||
| bool begin_mask_i = (static_cast<uint32_t>(args.begin_mask) & (1 << i_temp)); | |||||
| bool end_mask_i = (static_cast<uint32_t>(args.end_mask) & (1 << i_temp)); | |||||
| bool shrink_mask_i = (static_cast<uint32_t>(args.shrink_axis_mask) & (1 << i_temp)); | |||||
| ret = MaskCal(begin_mask_i, end_mask_i, shrink_mask_i, begin_i, end_i, dim_i); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGW("MaskCal failed, because of data overflow."); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| if (stride_i == 0) { | |||||
| stride_i = kDefaultSrideSize; | |||||
| } else if (stride_i < 0) { | |||||
| stride_i = -stride_i; | |||||
| begin_i = x_shape.GetDim(i) - begin_i - 1; | |||||
| end_i = x_shape.GetDim(i) - end_i - 1; | |||||
| } | |||||
| if ((begin_i == 0) && (end_i == 0)) { | |||||
| dim_final = x_shape.GetDim(i); | |||||
| } else { | |||||
| dim_final = abs(end_i - begin_i) / stride_i; | |||||
| } | |||||
| output_dims.push_back(dim_final); | |||||
| input_dims.push_back(x_shape.GetDim(i)); | |||||
| begin_vec.push_back(begin_i); | |||||
| stride_vec.push_back(stride_i); | |||||
| } | |||||
| // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. | |||||
| auto output_tensor_desc = attr->GetOutputDesc(0); | |||||
| GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc); | |||||
| if (output_ptr == nullptr) { | |||||
| GELOGW("MakeShared GeTensor failed, node name %s.", attr->GetName().c_str()); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data())); | |||||
| GE_CHECK_NOTNULL(data); | |||||
| ret = CheckOutputDims(output_dims, attr); | |||||
| if (ret != SUCCESS) { | |||||
| return ret; | |||||
| } | |||||
| ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), args.data_type, input_dims, begin_vec, | |||||
| output_dims, output_ptr.get(), stride_vec); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGW("SetOutputSliceData failed."); | |||||
| return NOT_CHANGED; | |||||
| } | |||||
| GeTensorDesc &t_d = output_ptr->MutableTensorDesc(); | |||||
| t_d.SetDataType(static_cast<DataType>(args.data_type)); | |||||
| uint32_t final_dim_size = static_cast<uint32_t>(output_dims.size()); | |||||
| vector<int64_t> v_dims; | |||||
| GetOutputDims(final_dim_size, output_dims, args, v_dims); | |||||
| t_d.SetShape(GeShape(v_dims)); | |||||
| v_output.push_back(output_ptr); | |||||
| GELOGI("StridedSliceKernel success."); | |||||
| return SUCCESS; | |||||
| } | |||||
| REGISTER_KERNEL(STRIDEDSLICE, StridedSliceKernel); | REGISTER_KERNEL(STRIDEDSLICE, StridedSliceKernel); | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -17,34 +17,33 @@ | |||||
| #ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | #ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | ||||
| #define GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | #define GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | ||||
| #include <vector> | |||||
| #include "inc/kernel.h" | #include "inc/kernel.h" | ||||
| #include <vector> | |||||
| namespace ge { | namespace ge { | ||||
| struct Attr { | |||||
| int64_t begin_mask; | |||||
| int64_t end_mask; | |||||
| int64_t ellipsis_mask; | |||||
| int64_t new_axis_mask; | |||||
| int64_t data_type; | |||||
| int64_t shrink_axis_mask; | |||||
| }; | |||||
| class StridedSliceKernel : public Kernel { | class StridedSliceKernel : public Kernel { | ||||
| public: | public: | ||||
| Status Compute(const OpDescPtr attr, const std::vector<ConstGeTensorPtr> &input, | Status Compute(const OpDescPtr attr, const std::vector<ConstGeTensorPtr> &input, | ||||
| vector<GeTensorPtr> &v_output) override; | vector<GeTensorPtr> &v_output) override; | ||||
| private: | private: | ||||
| Status CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input, Attr &args); | |||||
| Status CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, const ConstGeTensorPtr &weight2, | |||||
| const ConstGeTensorPtr &weight3) const; | |||||
| Status MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, int32_t &begin_i, | |||||
| int32_t &end_i, int32_t &dim_i) const; | |||||
| void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, const Attr &args, | |||||
| vector<int64_t> &v_dims); | |||||
| Status CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr); | |||||
| Status CheckAndGetAttr(const OpDescPtr &attr); | |||||
| Status CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const; | |||||
| Status InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input, std::vector<int64_t> &input_dims, | |||||
| std::vector<int64_t> &begin_vec, std::vector<int64_t> &output_dims, | |||||
| std::vector<int64_t> &stride_vec); | |||||
| Status MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const; | |||||
| Status StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i, | |||||
| int64_t &dim_final) const; | |||||
| void ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num, vector<int64_t> &x_dims); | |||||
| void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, vector<int64_t> &v_dims); | |||||
| map<string, uint32_t> attr_value_map_ = {{STRIDE_SLICE_ATTR_BEGIN_MASK, 0}, | |||||
| {STRIDE_SLICE_ATTR_END_MASK, 0}, | |||||
| {STRIDE_SLICE_ATTR_ELLIPSIS_MASK, 0}, | |||||
| {STRIDE_SLICE_ATTR_NEW_AXIS_MASK, 0}, | |||||
| {STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, 0}}; | |||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| #endif // GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | #endif // GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ | ||||
| @@ -27,6 +27,12 @@ const char *const kEnvProfilingLevel = "HYBRID_PROFILING_LEVEL"; | |||||
| HybridModelExecutor::HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream) | HybridModelExecutor::HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream) | ||||
| : model_(model), device_id_(device_id), stream_(stream) {} | : model_(model), device_id_(device_id), stream_(stream) {} | ||||
| HybridModelExecutor::~HybridModelExecutor() { | |||||
| if (context_.rt_gen_context != nullptr) { | |||||
| (void)rtCtxDestroy(context_.rt_gen_context); | |||||
| } | |||||
| } | |||||
| Status HybridModelExecutor::Init() { | Status HybridModelExecutor::Init() { | ||||
| GELOGD("Start to init HybridGraphEngine."); | GELOGD("Start to init HybridGraphEngine."); | ||||
| GE_CHK_STATUS_RET_NOLOG(InitExecutionContext()); | GE_CHK_STATUS_RET_NOLOG(InitExecutionContext()); | ||||
| @@ -35,7 +35,7 @@ class HybridModelExecutor { | |||||
| HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream); | HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream); | ||||
| ~HybridModelExecutor() = default; | |||||
| ~HybridModelExecutor(); | |||||
| Status Init(); | Status Init(); | ||||
| @@ -618,7 +618,8 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ | |||||
| } | } | ||||
| int64_t var_size = CalcVarSizeInBytes(*tensor_desc); | int64_t var_size = CalcVarSizeInBytes(*tensor_desc); | ||||
| tensor.reset(new (std::nothrow) TensorValue(dev_mem, var_size)); | |||||
| // var size is only for checking, will not allocate any memory by it | |||||
| tensor.reset(new (std::nothrow) TensorValue(dev_mem, static_cast<size_t>(var_size))); | |||||
| GE_CHECK_NOTNULL(tensor); | GE_CHECK_NOTNULL(tensor); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -197,7 +197,7 @@ void AicpuExtInfoHandler::GetShapeAndType(const AicpuShapeAndType *shape_and_typ | |||||
| dims.emplace_back(tmpDim); | dims.emplace_back(tmpDim); | ||||
| } | } | ||||
| data_type = static_cast<DataType>(shape_and_type->type); | data_type = static_cast<DataType>(shape_and_type->type); | ||||
| shape = std::move(GeShape(dims)); | |||||
| shape = GeShape(dims); | |||||
| } | } | ||||
| } // namespace hybrid | } // namespace hybrid | ||||
| } // namespace ge | |||||
| } // namespace ge | |||||
| @@ -48,6 +48,7 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) { | |||||
| std::vector<ConstGeTensorPtr> inputs; | std::vector<ConstGeTensorPtr> inputs; | ||||
| for (int32_t i = 0; i < context.NumInputs(); ++i) { | for (int32_t i = 0; i < context.NumInputs(); ++i) { | ||||
| const auto &input_desc = op_desc->GetInputDesc(i); | const auto &input_desc = op_desc->GetInputDesc(i); | ||||
| GE_CHECK_NOTNULL(context.GetInput(i)); | |||||
| auto in_tensor = MakeShared<GeTensor>(input_desc, reinterpret_cast<const uint8_t *>(context.GetInput(i)->GetData()), | auto in_tensor = MakeShared<GeTensor>(input_desc, reinterpret_cast<const uint8_t *>(context.GetInput(i)->GetData()), | ||||
| context.GetInput(i)->GetSize()); | context.GetInput(i)->GetSize()); | ||||
| GE_CHECK_NOTNULL(in_tensor); | GE_CHECK_NOTNULL(in_tensor); | ||||
| @@ -167,7 +167,6 @@ Status GELib::SystemInitialize(const map<string, string> &options) { | |||||
| // In train and infer, profiling is always needed. | // In train and infer, profiling is always needed. | ||||
| InitOptions(options); | InitOptions(options); | ||||
| InitProfiling(this->options_); | |||||
| auto model_manager = ModelManager::GetInstance(); | auto model_manager = ModelManager::GetInstance(); | ||||
| GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
| GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS, | GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS, | ||||
| @@ -175,23 +174,23 @@ Status GELib::SystemInitialize(const map<string, string> &options) { | |||||
| return FAILED); | return FAILED); | ||||
| // 1.`is_train_mode_` means case: train | // 1.`is_train_mode_` means case: train | ||||
| // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer | // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer | ||||
| // these two case need call `InitSystemWithOptions->rtGetDeviceIndexByPhyId` | |||||
| // to convert phy device id to logical device id | |||||
| // note:rtGetDeviceIndexByPhyId return `0` logical id when input phy device id is `0` | |||||
| // these two case with logical device id | |||||
| if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) { | if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) { | ||||
| InitProfiling(this->options_, true); | |||||
| status = InitSystemWithOptions(this->options_); | status = InitSystemWithOptions(this->options_); | ||||
| } else { | } else { | ||||
| InitProfiling(this->options_); | |||||
| status = InitSystemWithoutOptions(); | status = InitSystemWithoutOptions(); | ||||
| } | } | ||||
| return status; | return status; | ||||
| } | } | ||||
| void GELib::InitProfiling(Options &options) { | |||||
| void GELib::InitProfiling(Options &options, bool convert_2_phy_device_id) { | |||||
| GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id); | GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id); | ||||
| std::lock_guard<std::mutex> lock(status_mutex_); | std::lock_guard<std::mutex> lock(status_mutex_); | ||||
| GetContext().Init(); | GetContext().Init(); | ||||
| // Profiling init | // Profiling init | ||||
| if (ProfilingManager::Instance().Init(options) != SUCCESS) { | |||||
| if (ProfilingManager::Instance().Init(options, convert_2_phy_device_id) != SUCCESS) { | |||||
| GELOGW("Profiling init failed."); | GELOGW("Profiling init failed."); | ||||
| } | } | ||||
| } | } | ||||
| @@ -362,6 +361,9 @@ Status GELib::Finalize() { | |||||
| GELOGW("not initialize"); | GELOGW("not initialize"); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) { | |||||
| GE_CHK_RT_RET(rtSetDevice(options_.device_id)); | |||||
| } | |||||
| Status final_state = SUCCESS; | Status final_state = SUCCESS; | ||||
| Status mid_state; | Status mid_state; | ||||
| GELOGI("engineManager finalization."); | GELOGI("engineManager finalization."); | ||||
| @@ -412,10 +414,14 @@ Status GELib::Finalize() { | |||||
| GetMutableGlobalOptions().erase(ENABLE_SINGLE_STREAM); | GetMutableGlobalOptions().erase(ENABLE_SINGLE_STREAM); | ||||
| if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) { | |||||
| GE_CHK_RT_RET(rtDeviceReset(options_.device_id)); | |||||
| } | |||||
| instancePtr_ = nullptr; | instancePtr_ = nullptr; | ||||
| init_flag_ = false; | init_flag_ = false; | ||||
| if (final_state != SUCCESS) { | if (final_state != SUCCESS) { | ||||
| GELOGE(FAILED, "MemManager finalization."); | |||||
| GELOGE(FAILED, "finalization failed."); | |||||
| return final_state; | return final_state; | ||||
| } | } | ||||
| GELOGI("finalization success."); | GELOGI("finalization success."); | ||||
| @@ -68,7 +68,7 @@ class GELib { | |||||
| // get incre build cache path | // get incre build cache path | ||||
| const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; } | const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; } | ||||
| void InitProfiling(Options &options); | |||||
| void InitProfiling(Options &options, bool convert_2_phy_device_id = false); | |||||
| void ShutDownProfiling(); | void ShutDownProfiling(); | ||||
| Status InitSystemWithoutOptions(); | Status InitSystemWithoutOptions(); | ||||
| @@ -18,6 +18,7 @@ | |||||
| #include <map> | #include <map> | ||||
| #include <memory> | #include <memory> | ||||
| #include <vector> | #include <vector> | ||||
| #include "common/dump/dump_properties.h" | |||||
| #include "common/util.h" | #include "common/util.h" | ||||
| #include "framework/common/debug/ge_log.h" | #include "framework/common/debug/ge_log.h" | ||||
| #include "graph/ge_context.h" | #include "graph/ge_context.h" | ||||
| @@ -30,6 +31,8 @@ | |||||
| namespace ge { | namespace ge { | ||||
| namespace { | namespace { | ||||
| const int32_t kDumpStatus = 0; | |||||
| Status CheckReuseMemoryOption(const std::map<string, string> &options) { | Status CheckReuseMemoryOption(const std::map<string, string> &options) { | ||||
| auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY); | auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY); | ||||
| if (iter != options.end()) { | if (iter != options.end()) { | ||||
| @@ -47,7 +50,7 @@ Status CheckReuseMemoryOption(const std::map<string, string> &options) { | |||||
| } // namespace | } // namespace | ||||
| static std::mutex mutex_; // BuildGraph and RunGraph use | static std::mutex mutex_; // BuildGraph and RunGraph use | ||||
| bool InnerSession::is_dump_server_inited_ = false; | |||||
| InnerSession::InnerSession(uint64_t session_id, const std::map<string, string> &options) | InnerSession::InnerSession(uint64_t session_id, const std::map<string, string> &options) | ||||
| : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {} | : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {} | ||||
| @@ -71,12 +74,12 @@ Status InnerSession::Initialize() { | |||||
| GE_CHK_RT_RET(rtSetDevice(GetContext().DeviceId())); | GE_CHK_RT_RET(rtSetDevice(GetContext().DeviceId())); | ||||
| PropertiesManager::Instance().GetDumpProperties(session_id_).InitByOptions(); | |||||
| DumpProperties dump_properties; | |||||
| dump_properties.InitByOptions(); | |||||
| ret = graph_manager_.Initialize(options_); | ret = graph_manager_.Initialize(options_); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(ret, "[InnerSession:%lu] initialize failed.", session_id_); | GELOGE(ret, "[InnerSession:%lu] initialize failed.", session_id_); | ||||
| PropertiesManager::Instance().RemoveDumpProperties(session_id_); | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -84,7 +87,6 @@ Status InnerSession::Initialize() { | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(ret, "failed to set malloc size"); | GELOGE(ret, "failed to set malloc size"); | ||||
| (void)graph_manager_.Finalize(); | (void)graph_manager_.Finalize(); | ||||
| PropertiesManager::Instance().RemoveDumpProperties(session_id_); | |||||
| GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -95,7 +97,6 @@ Status InnerSession::Initialize() { | |||||
| ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID); | ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(ret, "failed to init session instance"); | GELOGE(ret, "failed to init session instance"); | ||||
| PropertiesManager::Instance().RemoveDumpProperties(session_id_); | |||||
| } | } | ||||
| init_flag_ = true; | init_flag_ = true; | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -120,8 +121,6 @@ Status InnerSession::Finalize() { | |||||
| GELOGI("VarManager free var memory."); | GELOGI("VarManager free var memory."); | ||||
| (void)VarManager::Instance(session_id_)->FreeVarMemory(); | (void)VarManager::Instance(session_id_)->FreeVarMemory(); | ||||
| PropertiesManager::Instance().RemoveDumpProperties(session_id_); | |||||
| GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); | ||||
| return ret; | return ret; | ||||
| @@ -297,4 +296,5 @@ Status InnerSession::SaveVariables(const Graph &graph, const std::vector<std::st | |||||
| const std::vector<Tensor> &outputs, std::vector<Tensor> &var_values) { | const std::vector<Tensor> &outputs, std::vector<Tensor> &var_values) { | ||||
| return graph_manager_.SaveVariables(graph, var_names, outputs, var_values); | return graph_manager_.SaveVariables(graph, var_names, outputs, var_values); | ||||
| } | } | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -71,6 +71,7 @@ class InnerSession { | |||||
| std::mutex resource_mutex_; // AddGraph, RemoveGraph and Finalize use | std::mutex resource_mutex_; // AddGraph, RemoveGraph and Finalize use | ||||
| void UpdateThreadContext(const std::map<std::string, std::string> &options); | void UpdateThreadContext(const std::map<std::string, std::string> &options); | ||||
| void UpdateThreadContext(uint32_t graph_id); | void UpdateThreadContext(uint32_t graph_id); | ||||
| static bool is_dump_server_inited_; | |||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -24,6 +24,7 @@ | |||||
| #include "graph/load/new_model_manager/model_utils.h" | #include "graph/load/new_model_manager/model_utils.h" | ||||
| #include "runtime/mem.h" | #include "runtime/mem.h" | ||||
| #include "single_op/single_op_manager.h" | #include "single_op/single_op_manager.h" | ||||
| #include "graph/load/new_model_manager/model_manager.h" | |||||
| namespace ge { | namespace ge { | ||||
| namespace { | namespace { | ||||
| @@ -42,6 +43,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() { | |||||
| delete task; | delete task; | ||||
| task = nullptr; | task = nullptr; | ||||
| } | } | ||||
| GELOGI("SingleOp destory sessionId = %lu", aicpu_session_id_); | |||||
| ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_); | |||||
| } | } | ||||
| Status SingleOp::ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs) { | Status SingleOp::ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs) { | ||||
| @@ -166,6 +169,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| ret = task->OpenDump(args_, stream_); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Open dump failed"); | |||||
| return ret; | |||||
| } | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| @@ -173,9 +181,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c | |||||
| void SingleOp::SetStream(rtStream_t stream) { stream_ = stream; } | void SingleOp::SetStream(rtStream_t stream) { stream_ = stream; } | ||||
| void SingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; } | |||||
| DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex, rtStream_t stream) | DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex, rtStream_t stream) | ||||
| : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {} | : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {} | ||||
| DynamicSingleOp::~DynamicSingleOp() { | |||||
| GELOGI("DynamicSingleOp destory sessionId = %lu", aicpu_session_id_); | |||||
| ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_); | |||||
| } | |||||
| Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs, | Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs, | ||||
| std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs) const { | std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs) const { | ||||
| if (inputs.size() != input_desc.size()) { | if (inputs.size() != input_desc.size()) { | ||||
| @@ -236,14 +251,22 @@ Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs, | |||||
| vector<GeTensorDesc> &output_desc, vector<void *> &outputs) { | |||||
| GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc)); | |||||
| std::vector<void *> workspace_buffers; | |||||
| GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers)); | |||||
| return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_); | |||||
| } | |||||
| Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, const vector<DataBuffer> &input_buffers, | Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, const vector<DataBuffer> &input_buffers, | ||||
| vector<GeTensorDesc> &output_desc, vector<DataBuffer> &output_buffers) { | vector<GeTensorDesc> &output_desc, vector<DataBuffer> &output_buffers) { | ||||
| GE_CHECK_NOTNULL(op_task_); | GE_CHECK_NOTNULL(op_task_); | ||||
| GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); | GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); | ||||
| std::lock_guard<std::mutex> lk(*stream_mutex_); | std::lock_guard<std::mutex> lk(*stream_mutex_); | ||||
| GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc)); | |||||
| std::vector<void *> workspace_buffers; | |||||
| GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers)); | |||||
| std::vector<void *> inputs; | std::vector<void *> inputs; | ||||
| std::vector<void *> outputs; | std::vector<void *> outputs; | ||||
| for (auto &buffer : input_buffers) { | for (auto &buffer : input_buffers) { | ||||
| @@ -252,6 +275,17 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, con | |||||
| for (auto &buffer : output_buffers) { | for (auto &buffer : output_buffers) { | ||||
| outputs.emplace_back(buffer.data); | outputs.emplace_back(buffer.data); | ||||
| } | } | ||||
| return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_); | |||||
| if (op_task_->GetOpTaskType() == OP_TASK_TBE) { | |||||
| return ExecuteTbeTask(input_desc, inputs, output_desc, outputs); | |||||
| } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) { | |||||
| return op_task_->LaunchKernel(input_desc, inputs, output_desc, outputs, stream_); | |||||
| } else { | |||||
| GELOGE(UNSUPPORTED, "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u", | |||||
| op_task_->GetOpTaskType()); | |||||
| return UNSUPPORTED; | |||||
| } | |||||
| } | } | ||||
| void DynamicSingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; } | |||||
| } // namespace ge | } // namespace ge | ||||
| @@ -27,6 +27,7 @@ | |||||
| #include "framework/executor/ge_executor.h" | #include "framework/executor/ge_executor.h" | ||||
| #include "runtime/stream.h" | #include "runtime/stream.h" | ||||
| #include "task/op_task.h" | #include "task/op_task.h" | ||||
| #include "cce/aicpu_engine_struct.h" | |||||
| namespace ge { | namespace ge { | ||||
| class SingleOp { | class SingleOp { | ||||
| @@ -36,6 +37,7 @@ class SingleOp { | |||||
| Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | ||||
| void SetStream(rtStream_t stream); | void SetStream(rtStream_t stream); | ||||
| void SetSessionID(uint64_t session_id); | |||||
| private: | private: | ||||
| Status ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | Status ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | ||||
| @@ -50,6 +52,7 @@ class SingleOp { | |||||
| std::vector<void *> output_addr_list_; | std::vector<void *> output_addr_list_; | ||||
| std::vector<size_t> output_sizes_; | std::vector<size_t> output_sizes_; | ||||
| std::vector<uintptr_t> args_; | std::vector<uintptr_t> args_; | ||||
| uint64_t aicpu_session_id_ = 0; | |||||
| std::vector<OpTask *> tasks_; | std::vector<OpTask *> tasks_; | ||||
| std::vector<std::vector<uintptr_t *>> arg_table_; | std::vector<std::vector<uintptr_t *>> arg_table_; | ||||
| @@ -58,9 +61,10 @@ class SingleOp { | |||||
| class DynamicSingleOp { | class DynamicSingleOp { | ||||
| public: | public: | ||||
| DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream); | DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream); | ||||
| ~DynamicSingleOp() = default; | |||||
| ~DynamicSingleOp(); | |||||
| Status ExecuteAsync(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs, | Status ExecuteAsync(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs, | ||||
| std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs); | std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs); | ||||
| void SetSessionID(uint64_t session_id); | |||||
| private: | private: | ||||
| friend class SingleOpModel; | friend class SingleOpModel; | ||||
| @@ -69,12 +73,16 @@ class DynamicSingleOp { | |||||
| Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, std::vector<void *> &workspaces); | Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, std::vector<void *> &workspaces); | ||||
| std::unique_ptr<TbeOpTask> op_task_; | |||||
| Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs, | |||||
| vector<GeTensorDesc> &output_desc, vector<void *> &outputs); | |||||
| std::unique_ptr<OpTask> op_task_; | |||||
| uintptr_t resource_id_ = 0; | uintptr_t resource_id_ = 0; | ||||
| std::mutex *stream_mutex_; | std::mutex *stream_mutex_; | ||||
| rtStream_t stream_ = nullptr; | rtStream_t stream_ = nullptr; | ||||
| size_t num_inputs_ = 0; | size_t num_inputs_ = 0; | ||||
| size_t num_outputs_ = 0; | size_t num_outputs_ = 0; | ||||
| uint64_t aicpu_session_id_ = 0; | |||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| #endif // GE_SINGLE_OP_SINGLE_OP_H_ | #endif // GE_SINGLE_OP_SINGLE_OP_H_ | ||||
| @@ -16,6 +16,7 @@ | |||||
| #include "single_op/single_op_model.h" | #include "single_op/single_op_model.h" | ||||
| #include <atomic> | |||||
| #include <memory> | #include <memory> | ||||
| #include <string> | #include <string> | ||||
| #include <vector> | #include <vector> | ||||
| @@ -31,6 +32,8 @@ | |||||
| #include "task/aicpu_kernel_task_builder.h" | #include "task/aicpu_kernel_task_builder.h" | ||||
| #include "task/tbe_task_builder.h" | #include "task/tbe_task_builder.h" | ||||
| static std::atomic<std::uint64_t> aicpu_sessionid(0); | |||||
| using domi::TaskDef; | using domi::TaskDef; | ||||
| using std::unique_ptr; | using std::unique_ptr; | ||||
| using std::vector; | using std::vector; | ||||
| @@ -250,17 +253,21 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||||
| } | } | ||||
| single_op.tasks_.emplace_back(task); | single_op.tasks_.emplace_back(task); | ||||
| } else { | } else { | ||||
| GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernek are supported, but got %u", context.kernel_type()); | |||||
| GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type()); | |||||
| return UNSUPPORTED; | return UNSUPPORTED; | ||||
| } | } | ||||
| } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { | } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { | ||||
| GELOGD("Building AICPU_TF task"); | GELOGD("Building AICPU_TF task"); | ||||
| OpTask *task = nullptr; | |||||
| auto ret = BuildKernelExTask(task_def.kernel_ex(), single_op, &task); | |||||
| AiCpuTask *aicpu_task = nullptr; | |||||
| bool depend_compute_flag = false; | |||||
| uint64_t singleop_sessionid = aicpu_sessionid++; | |||||
| GELOGI("Build singleOp, sessionId = %lu", singleop_sessionid); | |||||
| auto ret = BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_sessionid); | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| single_op.tasks_.emplace_back(task); | |||||
| single_op.tasks_.emplace_back(aicpu_task); | |||||
| single_op.SetSessionID(singleop_sessionid); | |||||
| } else { | } else { | ||||
| // skip | // skip | ||||
| GELOGD("Skip task type: %d", static_cast<int>(task_type)); | GELOGD("Skip task type: %d", static_cast<int>(task_type)); | ||||
| @@ -316,7 +323,8 @@ Status SingleOpModel::BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTa | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, SingleOp &single_op, OpTask **task) { | |||||
| Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag, | |||||
| bool &depend_compute_flag, uint64_t session_id) { | |||||
| auto iter = op_list_.find(kernel_def.op_index()); | auto iter = op_list_.find(kernel_def.op_index()); | ||||
| if (iter == op_list_.end()) { | if (iter == op_list_.end()) { | ||||
| GELOGE(INTERNAL_ERROR, "op desc not found. op index = %u", kernel_def.op_index()); | GELOGE(INTERNAL_ERROR, "op desc not found. op index = %u", kernel_def.op_index()); | ||||
| @@ -329,11 +337,12 @@ Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, Sin | |||||
| return MEMALLOC_FAILED; | return MEMALLOC_FAILED; | ||||
| } | } | ||||
| auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def); | auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def); | ||||
| auto ret = builder.BuildTask(*aicpu_task, model_params_); | |||||
| auto ret = builder.BuildTask(*aicpu_task, model_params_, dynamic_flag, session_id); | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(ret, "build aicpu_TF op task failed"); | GELOGE(ret, "build aicpu_TF op task failed"); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| depend_compute_flag = (aicpu_task->GetUnknownType() == DEPEND_COMPUTE); | |||||
| *task = aicpu_task.release(); | *task = aicpu_task.release(); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -370,6 +379,27 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { | |||||
| return BuildTaskList(single_op); | return BuildTaskList(single_op); | ||||
| } | } | ||||
| Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) { | |||||
| const domi::KernelDef &kernel_def = task_def.kernel(); | |||||
| const auto &context = kernel_def.context(); | |||||
| auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type()); | |||||
| if (kernel_type == cce::ccKernelType::TE) { | |||||
| GELOGD("Building TBE task"); | |||||
| TbeOpTask *tbe_task = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task)); | |||||
| single_op.op_task_.reset(tbe_task); | |||||
| } else if (kernel_type == cce::ccKernelType::AI_CPU) { | |||||
| GELOGD("Building AICPU_CC task"); | |||||
| OpTask *task = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task)); | |||||
| single_op.op_task_.reset(task); | |||||
| } else { | |||||
| GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type()); | |||||
| return UNSUPPORTED; | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { | Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { | ||||
| auto ge_model = model_helper_.GetGeModel(); | auto ge_model = model_helper_.GetGeModel(); | ||||
| GE_CHECK_NOTNULL(ge_model); | GE_CHECK_NOTNULL(ge_model); | ||||
| @@ -385,10 +415,30 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { | |||||
| GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks."); | GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks."); | ||||
| return UNSUPPORTED; | return UNSUPPORTED; | ||||
| } | } | ||||
| TbeOpTask *task = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &task)); | |||||
| single_op.op_task_.reset(task); | |||||
| GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(task_def, single_op)); | |||||
| } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { | |||||
| if (single_op.op_task_ != nullptr) { | |||||
| GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks."); | |||||
| return UNSUPPORTED; | |||||
| } | |||||
| GELOGD("Building AICPU_TF task"); | |||||
| AiCpuTask *aicpu_task = nullptr; | |||||
| bool depend_compute_flag = false; | |||||
| uint64_t dynamic_singleop_sessionid = aicpu_sessionid++; | |||||
| GELOGI("Build dynamic singleOp, sessionId = %lu", dynamic_singleop_sessionid); | |||||
| GE_CHK_STATUS_RET_NOLOG( | |||||
| BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, true, depend_compute_flag, dynamic_singleop_sessionid)); | |||||
| if (depend_compute_flag) { | |||||
| if (i >= tasks.size() - 1) { | |||||
| GELOGE(FAILED, "The copy task of the fourth operator was not found."); | |||||
| return FAILED; | |||||
| } | |||||
| ++i; | |||||
| const TaskDef ©_task_def = tasks[i]; | |||||
| GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex())); | |||||
| } | |||||
| single_op.op_task_.reset(aicpu_task); | |||||
| single_op.SetSessionID(dynamic_singleop_sessionid); | |||||
| } else { | } else { | ||||
| // skip | // skip | ||||
| GELOGD("Skip task type: %d", static_cast<int>(task_type)); | GELOGD("Skip task type: %d", static_cast<int>(task_type)); | ||||
| @@ -66,8 +66,10 @@ class SingleOpModel { | |||||
| Status BuildTaskList(SingleOp &single_op); | Status BuildTaskList(SingleOp &single_op); | ||||
| Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); | Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); | ||||
| Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task); | Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task); | ||||
| Status BuildKernelExTask(const domi::KernelExDef &kernel_def, SingleOp &single_op, OpTask **task); | |||||
| Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag, | |||||
| bool &depend_compute_flag, uint64_t session_id); | |||||
| Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task); | Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task); | ||||
| Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op); | |||||
| static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); | static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); | ||||
| void ParseArgTable(TbeOpTask *task, SingleOp &op); | void ParseArgTable(TbeOpTask *task, SingleOp &op); | ||||
| @@ -54,6 +54,29 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task) { | |||||
| task.SetSoName(so_name); | task.SetSoName(so_name); | ||||
| task.SetkernelName(kernel_name); | task.SetkernelName(kernel_name); | ||||
| task.op_desc_ = op_desc_; | task.op_desc_ = op_desc_; | ||||
| task.num_inputs_ = op_desc_->GetInputsSize(); | |||||
| task.num_outputs_ = op_desc_->GetOutputsSize(); | |||||
| // get kernel_ext_info | |||||
| auto &kernel_ext_info = kernel_def_.kernel_ext_info(); | |||||
| auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size(); | |||||
| GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED, | |||||
| "task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(), | |||||
| kernel_ext_info_size); | |||||
| ret = task.SetExtInfoAndType(kernel_ext_info); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Init ext info failed."); | |||||
| return ret; | |||||
| } | |||||
| auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(task.args_.get()); | |||||
| if (task.ext_info_addr_dev_ != nullptr) { | |||||
| aicpu_param_head->extInfoLength = kernel_ext_info.size(); | |||||
| aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_); | |||||
| } | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -30,13 +30,13 @@ Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<vo | |||||
| size_t arg_size = kernel_def_.args_size(); | size_t arg_size = kernel_def_.args_size(); | ||||
| auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM); | auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM); | ||||
| if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
| GELOGE(RT_FAILED, "rtMallocHost failed, size = %zu, ret = %d", arg_size, rt_ret); | |||||
| GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret); | |||||
| return RT_FAILED; | return RT_FAILED; | ||||
| } | } | ||||
| const void *src_addr = reinterpret_cast<const void *>(addresses.data()); | const void *src_addr = reinterpret_cast<const void *>(addresses.data()); | ||||
| uint64_t src_len = sizeof(void *) * addresses.size(); | uint64_t src_len = sizeof(void *) * addresses.size(); | ||||
| rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST); | |||||
| rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
| (void)rtFree(*io_addr); | (void)rtFree(*io_addr); | ||||
| GELOGE(RT_FAILED, "rtMemcpy addresses failed, ret = %d", rt_ret); | GELOGE(RT_FAILED, "rtMemcpy addresses failed, ret = %d", rt_ret); | ||||
| @@ -69,8 +69,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke | |||||
| return RT_FAILED; | return RT_FAILED; | ||||
| } | } | ||||
| rt_ret = | |||||
| rtMemcpy(fwk_op_args, sizeof(STR_FWK_OP_KERNEL), &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_HOST); | |||||
| rt_ret = rtMemcpy(fwk_op_args, sizeof(STR_FWK_OP_KERNEL), &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), | |||||
| RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
| (void)rtFree(fwk_op_args); | (void)rtFree(fwk_op_args); | ||||
| GELOGE(RT_FAILED, "copy args failed, ret = %d", rt_ret); | GELOGE(RT_FAILED, "copy args failed, ret = %d", rt_ret); | ||||
| @@ -80,7 +80,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m) { | |||||
| Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam ¶m, | |||||
| bool dynamic_flag) { | |||||
| if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { | if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { | ||||
| GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL), | GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL), | ||||
| kernel_def_.args_size()); | kernel_def_.args_size()); | ||||
| @@ -88,31 +89,60 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam | |||||
| } | } | ||||
| auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); | auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); | ||||
| auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace); | auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace); | ||||
| if (ws_addr_vec.empty()) { | |||||
| GELOGE(PARAM_INVALID, "workspace Data Address is empty."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| auto rt_ret = rtMemcpy(ws_addr_vec[0], kernel_def_.task_info_size(), kernel_def_.task_info().data(), | |||||
| kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE); | |||||
| if (rt_ret != RT_ERROR_NONE) { | |||||
| GELOGE(FAILED, "rtMemcpy error: 0x%X", rt_ret); | |||||
| return FAILED; | |||||
| if (dynamic_flag) { | |||||
| GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM)); | |||||
| } else { | |||||
| if (ws_addr_vec.empty()) { | |||||
| GELOGE(PARAM_INVALID, "workspace Data Address is empty."); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| *kernel_workspace = ws_addr_vec[0]; | |||||
| } | } | ||||
| GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), kernel_def_.task_info().data(), | |||||
| kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| void *io_addr = nullptr; | |||||
| auto ret = SetInputOutputAddr(&io_addr, BuildTaskUtils::JoinAddresses(addresses)); | |||||
| auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses)); | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag, | |||||
| uint64_t session_id) { | |||||
| void *io_addr = nullptr; | |||||
| void *kernel_workspace = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&io_addr, &kernel_workspace, param, dynamic_flag)); | |||||
| STR_FWK_OP_KERNEL fwk_op_kernel = {0}; | STR_FWK_OP_KERNEL fwk_op_kernel = {0}; | ||||
| ret = SetFmkOpKernel(io_addr, ws_addr_vec[0], fwk_op_kernel); | |||||
| auto ret = SetFmkOpKernel(io_addr, kernel_workspace, fwk_op_kernel); | |||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| (void)rtFree(io_addr); | (void)rtFree(io_addr); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| task.op_desc_ = op_desc_; | |||||
| task.num_inputs_ = op_desc_->GetInputsSize(); | |||||
| task.num_outputs_ = op_desc_->GetOutputsSize(); | |||||
| // get kernel_ext_info | |||||
| auto &kernel_ext_info = kernel_def_.kernel_ext_info(); | |||||
| auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size(); | |||||
| GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED, | |||||
| "task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(), | |||||
| kernel_ext_info_size); | |||||
| GE_CHK_STATUS_RET(task.SetExtInfoAndType(kernel_ext_info), "Init ext info failed."); | |||||
| if (task.ext_info_addr_dev_ != nullptr) { | |||||
| fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_); | |||||
| fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = kernel_ext_info_size; | |||||
| } | |||||
| GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "AiCpuTask init for summary and copy task failed."); | |||||
| // Create session | // Create session | ||||
| auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID; | |||||
| fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = session_id; | |||||
| GELOGI("Begin to CreateAicpuSession, session id: %lu", session_id); | |||||
| GE_CHECK_NOTNULL(ModelManager::GetInstance()); | GE_CHECK_NOTNULL(ModelManager::GetInstance()); | ||||
| GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS, | GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS, | ||||
| GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id); | GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id); | ||||
| @@ -127,8 +157,8 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam | |||||
| task.op_type_ = op_desc_->GetName(); | task.op_type_ = op_desc_->GetName(); | ||||
| task.io_addr_ = io_addr; | task.io_addr_ = io_addr; | ||||
| task.task_info_ = kernel_def_.task_info(); | task.task_info_ = kernel_def_.task_info(); | ||||
| task.workspace_addr_ = ws_addr_vec[0]; | |||||
| task.op_desc_ = op_desc_; | |||||
| task.workspace_addr_ = kernel_workspace; | |||||
| task.dynamic_flag_ = dynamic_flag; | |||||
| auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_); | auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_); | ||||
| GELOGI("[TASK_INFO] %s %s", task.task_info_.c_str(), debug_info.c_str()); | GELOGI("[TASK_INFO] %s %s", task.task_info_.c_str(), debug_info.c_str()); | ||||
| @@ -29,12 +29,14 @@ class AiCpuTaskBuilder { | |||||
| AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def); | AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def); | ||||
| ~AiCpuTaskBuilder() = default; | ~AiCpuTaskBuilder() = default; | ||||
| Status BuildTask(AiCpuTask &task, const SingleOpModelParam ¶m); | |||||
| Status BuildTask(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag, uint64_t session_id); | |||||
| private: | private: | ||||
| static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel); | static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel); | ||||
| Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses); | Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses); | ||||
| Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel); | Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel); | ||||
| Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam ¶m, | |||||
| bool dynamic_flag); | |||||
| const OpDescPtr op_desc_; | const OpDescPtr op_desc_; | ||||
| const domi::KernelExDef &kernel_def_; | const domi::KernelExDef &kernel_def_; | ||||
| @@ -20,8 +20,10 @@ | |||||
| #include <chrono> | #include <chrono> | ||||
| #include <thread> | #include <thread> | ||||
| #include "aicpu/common/aicpu_task_struct.h" | |||||
| #include "common/dump/dump_manager.h" | #include "common/dump/dump_manager.h" | ||||
| #include "common/dump/dump_op.h" | #include "common/dump/dump_op.h" | ||||
| #include "common/formats/formats.h" | |||||
| #include "framework/common/debug/log.h" | #include "framework/common/debug/log.h" | ||||
| #include "register/op_tiling.h" | #include "register/op_tiling.h" | ||||
| #include "runtime/rt.h" | #include "runtime/rt.h" | ||||
| @@ -30,24 +32,31 @@ namespace ge { | |||||
| namespace { | namespace { | ||||
| constexpr int kLaunchRetryTimes = 1000; | constexpr int kLaunchRetryTimes = 1000; | ||||
| constexpr int kSleepTime = 10; | constexpr int kSleepTime = 10; | ||||
| constexpr uint64_t kReleaseFlag = 1; | |||||
| constexpr int kCopyNum = 2; | |||||
| } // namespace | } // namespace | ||||
| Status OpTask::OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream) { | |||||
| if (DumpManager::GetInstance().IsDumpOpen()) { | |||||
| Status OpTask::OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream) { | |||||
| if (DumpManager::GetInstance().GetDumpProperties().IsSingleOpNeedDump()) { | |||||
| GELOGI("Dump is open in single op,start to set dump info"); | GELOGI("Dump is open in single op,start to set dump info"); | ||||
| std::vector<uint64_t> input_addrs; | std::vector<uint64_t> input_addrs; | ||||
| std::vector<uint64_t> output_adds; | std::vector<uint64_t> output_adds; | ||||
| auto input_size = op_desc->GetAllInputsDesc().size(); | |||||
| auto output_size = op_desc->GetOutputsSize(); | |||||
| auto input_size = op_desc_->GetInputsSize(); | |||||
| auto output_size = op_desc_->GetOutputsSize(); | |||||
| auto all_size = io_addr.size(); | |||||
| if (input_size + output_size != all_size) { | |||||
| GELOGE(FAILED, "io_addr size is not equal input and output size"); | |||||
| return FAILED; | |||||
| } | |||||
| for (size_t i = 0; i < input_size; i++) { | for (size_t i = 0; i < input_size; i++) { | ||||
| uint64_t input_addr = *(reinterpret_cast<const uint64_t *>(arg) + i); | |||||
| uint64_t input_addr = static_cast<uint64_t>(io_addr[i]); | |||||
| input_addrs.emplace_back(input_addr); | input_addrs.emplace_back(input_addr); | ||||
| } | } | ||||
| for (size_t j = 0; j < output_size; j++) { | for (size_t j = 0; j < output_size; j++) { | ||||
| uint64_t output_addr = *(reinterpret_cast<const uint64_t *>(arg) + input_size + j); | |||||
| uint64_t output_addr = static_cast<uint64_t>(io_addr[input_size + j]); | |||||
| output_adds.emplace_back(output_addr); | output_adds.emplace_back(output_addr); | ||||
| } | } | ||||
| dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc, input_addrs, output_adds, stream); | |||||
| dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream); | |||||
| auto status = dump_op_.LaunchDumpOp(); | auto status = dump_op_.LaunchDumpOp(); | ||||
| if (status != SUCCESS) { | if (status != SUCCESS) { | ||||
| GELOGE(status, "Launch dump op failed in single op"); | GELOGE(status, "Launch dump op failed in single op"); | ||||
| @@ -112,11 +121,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { | |||||
| } | } | ||||
| GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); | GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); | ||||
| auto status = OpenDump(args_.get(), op_desc_, stream); | |||||
| if (status != SUCCESS) { | |||||
| GELOGE(status, "Open dump failed in tbe single op %s", stub_name_.c_str()); | |||||
| return status; | |||||
| } | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -218,6 +222,119 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void * | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| AiCpuBaseTask::~AiCpuBaseTask() { | |||||
| if (ext_info_addr_dev_ != nullptr) { | |||||
| (void)rtFree(ext_info_addr_dev_); | |||||
| } | |||||
| } | |||||
| Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info) { | |||||
| if (kernel_ext_info.empty()) { | |||||
| GELOGI("Kernel_ext_info is empty, no need copy to device."); | |||||
| return SUCCESS; | |||||
| } | |||||
| int32_t unknown_shape_type_val = 0; | |||||
| (void)AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val); | |||||
| GELOGD("Get unknown_type is %d.", unknown_shape_type_val); | |||||
| unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val); | |||||
| aicpu_ext_handle_.reset( | |||||
| new (std::nothrow)::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(), num_inputs_, num_outputs_, unknown_type_)); | |||||
| GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, FAILED, "Malloc aicpu_ext_handle mem failed!"); | |||||
| Status ret = aicpu_ext_handle_->Parse(kernel_ext_info); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size()); | |||||
| return ret; | |||||
| } | |||||
| GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, kernel_ext_info.size(), RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, kernel_ext_info.size(), kernel_ext_info.data(), kernel_ext_info.size(), | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc, | |||||
| std::vector<GeTensorDesc> &output_desc) { | |||||
| GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_); | |||||
| if (num_inputs_ == 0 && num_outputs_ == 0) { | |||||
| GELOGI("No input and output, no need update ext info."); | |||||
| return SUCCESS; | |||||
| } | |||||
| GE_CHECK_NOTNULL(aicpu_ext_handle_); | |||||
| for (size_t i = 0; i < num_inputs_; ++i) { | |||||
| GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(i, input_desc[i]), | |||||
| "Input[%zu] update input shape failed.", i); | |||||
| } | |||||
| if (unknown_type_ != DEPEND_COMPUTE) { | |||||
| for (size_t j = 0; j < num_outputs_; ++j) { | |||||
| GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]), | |||||
| "Output[%zu] UpdateOutputShapeAndType failed.", j); | |||||
| // debug code | |||||
| GELOGD("No input and output, no need update ext info."); | |||||
| } | |||||
| } | |||||
| GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, | |||||
| aicpu_ext_handle_->GetExtInfoLen(), // check size | |||||
| aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GELOGI("Update ext info end."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) { | |||||
| if (num_outputs_ == 0) { | |||||
| GELOGD("AiCpuBaseTask output_num is 0, no need update output shape."); | |||||
| return SUCCESS; | |||||
| } | |||||
| GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape."); | |||||
| GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_, | |||||
| aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST)); | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| GeShape shape; | |||||
| DataType data_type; | |||||
| aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type); | |||||
| GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.", | |||||
| i); | |||||
| } | |||||
| GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished."); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) { | |||||
| auto shape_old = output_desc.GetShape(); | |||||
| output_desc.SetShape(shape_new); | |||||
| GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str()); | |||||
| auto origin_shape_old = output_desc.GetOriginShape(); | |||||
| auto origin_format = output_desc.GetOriginFormat(); | |||||
| auto format = output_desc.GetFormat(); | |||||
| if (origin_format == format) { | |||||
| output_desc.SetOriginShape(shape_new); | |||||
| return SUCCESS; | |||||
| } | |||||
| std::vector<int64_t> origin_dims_new; | |||||
| auto trans_ret = | |||||
| formats::TransShape(format, shape_new.GetDims(), output_desc.GetDataType(), origin_format, origin_dims_new); | |||||
| GE_CHK_STATUS_RET(trans_ret, "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.", | |||||
| origin_format, format, shape_new.ToString().c_str()); | |||||
| auto origin_shape_new = GeShape(origin_dims_new); | |||||
| output_desc.SetOriginShape(origin_shape_new); | |||||
| GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need update from %s ro %s.", origin_format, format, | |||||
| origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str()); | |||||
| return SUCCESS; | |||||
| } | |||||
| AiCpuTask::~AiCpuTask() { | AiCpuTask::~AiCpuTask() { | ||||
| if (args_ != nullptr) { | if (args_ != nullptr) { | ||||
| (void)rtFree(args_); | (void)rtFree(args_); | ||||
| @@ -226,6 +343,43 @@ AiCpuTask::~AiCpuTask() { | |||||
| if (io_addr_ != nullptr) { | if (io_addr_ != nullptr) { | ||||
| (void)rtFree(io_addr_); | (void)rtFree(io_addr_); | ||||
| } | } | ||||
| if (dynamic_flag_ && workspace_addr_ != nullptr) { | |||||
| (void)rtFree(workspace_addr_); | |||||
| } | |||||
| if (copy_workspace_buf_ != nullptr) { | |||||
| (void)rtFree(copy_workspace_buf_); | |||||
| } | |||||
| if (copy_ioaddr_dev_ != nullptr) { | |||||
| (void)rtFree(copy_ioaddr_dev_); | |||||
| } | |||||
| if (copy_input_release_flag_dev_ != nullptr) { | |||||
| (void)rtFree(copy_input_release_flag_dev_); | |||||
| } | |||||
| if (copy_input_data_size_dev_ != nullptr) { | |||||
| (void)rtFree(copy_input_data_size_dev_); | |||||
| } | |||||
| if (copy_input_src_dev_ != nullptr) { | |||||
| (void)rtFree(copy_input_src_dev_); | |||||
| } | |||||
| if (copy_input_dst_dev_ != nullptr) { | |||||
| (void)rtFree(copy_input_dst_dev_); | |||||
| } | |||||
| if (copy_task_args_buf_ != nullptr) { | |||||
| (void)rtFree(copy_task_args_buf_); | |||||
| } | |||||
| for (auto summary : output_summary_) { | |||||
| if (summary != nullptr) { | |||||
| (void)rtFree(summary); | |||||
| } | |||||
| } | |||||
| } | } | ||||
| const void *AiCpuTask::GetIOAddr() const { return io_addr_; } | const void *AiCpuTask::GetIOAddr() const { return io_addr_; } | ||||
| @@ -247,15 +401,225 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) { | |||||
| } | } | ||||
| GELOGI("[TASK_INFO] is %s", this->task_info_.c_str()); | GELOGI("[TASK_INFO] is %s", this->task_info_.c_str()); | ||||
| auto status = OpenDump(args_, op_desc_, stream); | |||||
| if (status != SUCCESS) { | |||||
| GELOGE(status, "Open dump failed in aicpu single op %s", op_type_.c_str()); | |||||
| return status; | |||||
| } | |||||
| GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str()); | GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str()); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm) { | |||||
| std::vector<uint64_t> copy_input_release_flag; | |||||
| std::vector<uint64_t> copy_input_data_size; | |||||
| std::vector<uint64_t> copy_input_src; | |||||
| std::vector<uint64_t> copy_input_dst; | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| const auto &summary = output_summary_host_[i]; | |||||
| GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.", i, | |||||
| summary.shape_data_ptr, summary.shape_data_size, summary.raw_data_ptr, summary.raw_data_size); | |||||
| auto output = outputs[i]; | |||||
| copy_input_release_flag.emplace_back(kReleaseFlag); | |||||
| copy_input_data_size.emplace_back(summary.raw_data_size); | |||||
| copy_input_src.emplace_back(summary.raw_data_ptr); | |||||
| copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output)); | |||||
| const auto &shape_buffer = out_shape_hbm[i]; | |||||
| copy_input_release_flag.emplace_back(kReleaseFlag); | |||||
| copy_input_data_size.emplace_back(summary.shape_data_size); | |||||
| copy_input_src.emplace_back(summary.shape_data_ptr); | |||||
| copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer)); | |||||
| } | |||||
| const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t); | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len, copy_input_release_flag.data(), | |||||
| copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len, copy_input_data_size.data(), copy_input_buf_len, | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len, copy_input_src.data(), copy_input_buf_len, | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len, copy_input_dst.data(), copy_input_buf_len, | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| return SUCCESS; | |||||
| } | |||||
// For each output, fetches the kernel-produced ResultSummary from device
// memory into output_summary_host_, then allocates a device buffer sized to
// that output's shape data and records it in out_shape_hbm.
Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm) {
  for (size_t i = 0; i < num_outputs_; ++i) {
    auto &result_summary = output_summary_host_[i];

    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary), output_summary_[i],
                           sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMCPY_DEVICE_TO_HOST));
    auto shape_data_size = result_summary.shape_data_size;
    void *shape_buffer = nullptr;
    // NOTE(review): GE_MAKE_GUARD_RTMEM appears to tie shape_buffer's lifetime
    // to this scope, yet the pointer is stored in out_shape_hbm and consumed
    // later by CopyDataToHbm/UpdateShapeByHbmBuffer — confirm the guard does
    // not free the buffer before those uses.
    GE_MAKE_GUARD_RTMEM(shape_buffer);
    GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
    out_shape_hbm.emplace_back(shape_buffer);
  }
  return SUCCESS;
}
// Stages the copy-task arguments, launches the device-side memcpy kernel, and
// synchronizes the stream so destination buffers are ready on return.
Status AiCpuTask::CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs, out_shape_hbm));

  // copy_task_args_buf_ was populated by SetMemCopyTask.
  GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_KERNEL_DEFAULT, stream));
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}
| Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm) { | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| const auto &result_summary = output_summary_host_[i]; | |||||
| std::vector<int64_t> shape_dims; | |||||
| const auto &shape_hbm = out_shape_hbm[i]; | |||||
| uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t); | |||||
| std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]()); | |||||
| GE_CHECK_NOTNULL(shape_addr); | |||||
| GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm, result_summary.shape_data_size, | |||||
| RT_MEMCPY_DEVICE_TO_HOST)); | |||||
| for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) { | |||||
| shape_dims.emplace_back(shape_addr[dim_idx]); | |||||
| GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]); | |||||
| } | |||||
| GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]), | |||||
| "AiCpuTask update [%zu]th output shape failed.", i); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
// DEPEND_COMPUTE path: after the kernel ran, each output's ResultSummary tells
// where the real data and shape live; copy them out and patch output_desc.
Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
                                                    rtStream_t stream) {
  if (num_outputs_ == 0) {
    GELOGI("Output num is 0, there is no need to update the output and size.");
    return SUCCESS;
  }

  GELOGI("Update shape and data by result summary begin.");

  std::vector<void *> out_shape_hbm;
  // Step 1: read summaries back and allocate HBM scratch for each shape blob.
  GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(out_shape_hbm),
                    "Read ResultSummary and update output shape failed.");

  // Step 2: launch the copy task that moves raw data and shape data into place.
  GE_CHK_STATUS_RET(CopyDataToHbm(outputs, out_shape_hbm, stream), "Copy data to output failed.");

  // Step 3: decode the copied shape blobs and update the output descriptors.
  GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc, out_shape_hbm), "Update shape by hbm buffer failed.");

  GELOGI("Update shape and data by result summary end.");
  return SUCCESS;
}
| Status AiCpuTask::SetIO(const vector<void *> &inputs, vector<void *> &outputs) { | |||||
| vector<uint64_t> io_addrs; | |||||
| io_addrs.reserve(num_inputs_ + num_outputs_); | |||||
| for (size_t i = 0; i < num_inputs_; ++i) { | |||||
| GE_CHECK_NOTNULL(inputs[i]); | |||||
| GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]); | |||||
| io_addrs.emplace_back(reinterpret_cast<uintptr_t>(inputs[i])); | |||||
| } | |||||
| if (unknown_type_ != DEPEND_COMPUTE) { | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| GE_CHECK_NOTNULL(outputs[i]); | |||||
| GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]); | |||||
| io_addrs.emplace_back(reinterpret_cast<uintptr_t>(outputs[i])); | |||||
| } | |||||
| } else { | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| void *summary_addr = output_summary_[i]; | |||||
| io_addrs.emplace_back(reinterpret_cast<uintptr_t>(summary_addr)); | |||||
| } | |||||
| } | |||||
| if (!io_addrs.empty()) { | |||||
| auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(io_addr_)); | |||||
| GE_CHK_RT_RET(rtMemcpy(dst_io_addr, sizeof(uint64_t) * io_addrs.size(), &io_addrs[0], | |||||
| sizeof(uint64_t) * io_addrs.size(), RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| GE_CHECK_NOTNULL(dst_io_addr); | |||||
| }; | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuTask::InitForSummaryAndCopy() { | |||||
| if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) { | |||||
| GELOGI("Unknown_type is %d, output num is %d.", unknown_type_, num_outputs_); | |||||
| return SUCCESS; | |||||
| } | |||||
| output_summary_.resize(num_outputs_); | |||||
| constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary); | |||||
| for (size_t i = 0; i < num_outputs_; ++i) { | |||||
| GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM)); | |||||
| } | |||||
| output_summary_host_.resize(num_outputs_); | |||||
| const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t); | |||||
| GE_CHK_RT_RET(rtMalloc(©_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMalloc(©_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMalloc(©_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMalloc(©_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMalloc(©_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM)); | |||||
| std::vector<uint64_t> copy_io_addr; | |||||
| copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_)); | |||||
| copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_)); | |||||
| copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_)); | |||||
| copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_)); | |||||
| const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size(); | |||||
| GE_CHK_RT_RET(rtMalloc(©_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET( | |||||
| rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size, copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) { | |||||
| if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) { | |||||
| GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL), | |||||
| kernel_def.args_size()); | |||||
| return PARAM_INVALID; | |||||
| } | |||||
| GE_CHK_RT_RET(rtMalloc(©_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM)); | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(), kernel_def.task_info().data(), | |||||
| kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| STR_FWK_OP_KERNEL aicpu_task = {0}; | |||||
| auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL), kernel_def.args().data(), kernel_def.args().size()); | |||||
| if (sec_ret != EOK) { | |||||
| GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); | |||||
| return FAILED; | |||||
| } | |||||
| aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_); | |||||
| aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_); | |||||
| aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0; | |||||
| aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0; | |||||
| GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), &aicpu_task, sizeof(STR_FWK_OP_KERNEL), | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs, | |||||
| std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, | |||||
| rtStream_t stream) { | |||||
| GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc)); | |||||
| GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs)); | |||||
| GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); | |||||
| GE_CHK_RT_RET(rtStreamSynchronize(stream)); | |||||
| if (unknown_type_ == DEPEND_SHAPE_RANGE) { | |||||
| GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); | |||||
| } else if (unknown_type_ == DEPEND_COMPUTE) { | |||||
| GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, outputs, stream)); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) { | void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) { | ||||
| args_ = std::move(args); | args_ = std::move(args); | ||||
| arg_size_ = arg_size; | arg_size_ = arg_size; | ||||
| @@ -291,11 +655,34 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { | |||||
| } | } | ||||
| GELOGD("Invoke rtCpuKernelLaunch succeeded"); | GELOGD("Invoke rtCpuKernelLaunch succeeded"); | ||||
| auto status = OpenDump(args_.get(), op_desc_, stream); | |||||
| if (status != SUCCESS) { | |||||
| GELOGE(status, "Open dump failed in aicpucc single op"); | |||||
| return status; | |||||
| return SUCCESS; | |||||
| } | |||||
| Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs, | |||||
| std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, | |||||
| rtStream_t stream) { | |||||
| GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED, | |||||
| "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", unknown_type_); | |||||
| GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc)); | |||||
| size_t arg_index = 0; | |||||
| auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_); | |||||
| GE_CHECK_NOTNULL(task_io_addr); | |||||
| for (auto &input : inputs) { | |||||
| task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input); | |||||
| } | |||||
| for (auto &output : outputs) { | |||||
| task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output); | |||||
| } | } | ||||
| GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); | |||||
| GE_CHK_RT_RET(rtStreamSynchronize(stream)); | |||||
| if (unknown_type_ == DEPEND_SHAPE_RANGE) { | |||||
| GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); | |||||
| } | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -27,6 +27,9 @@ | |||||
| #include "graph/op_kernel_bin.h" | #include "graph/op_kernel_bin.h" | ||||
| #include "runtime/stream.h" | #include "runtime/stream.h" | ||||
| #include "graph/node.h" | #include "graph/node.h" | ||||
| #include "cce/aicpu_engine_struct.h" | |||||
| #include "hybrid/node_executor/aicpu/aicpu_ext_info.h" | |||||
| #include "init/gelib.h" | |||||
| namespace ge { | namespace ge { | ||||
| enum OpTaskType { | enum OpTaskType { | ||||
| @@ -52,14 +55,20 @@ class OpTask { | |||||
| virtual const void *GetIOAddr() const = 0; | virtual const void *GetIOAddr() const = 0; | ||||
| const vector<int64_t> &GetWorkspaceSizes() const; | const vector<int64_t> &GetWorkspaceSizes() const; | ||||
| void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes); | void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes); | ||||
| const OpDescPtr &GetOpdesc() const { return op_desc_; } | |||||
| Status OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream); | |||||
| virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs, | |||||
| std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) { | |||||
| return UNSUPPORTED; | |||||
| } | |||||
| private: | private: | ||||
| std::vector<int64_t> workspace_sizes_; | std::vector<int64_t> workspace_sizes_; | ||||
| protected: | protected: | ||||
| Status OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream); | |||||
| DumpProperties dump_properties_; | DumpProperties dump_properties_; | ||||
| DumpOp dump_op_; | DumpOp dump_op_; | ||||
| OpDescPtr op_desc_; | |||||
| }; | }; | ||||
| class TbeOpTask : public OpTask { | class TbeOpTask : public OpTask { | ||||
| @@ -97,10 +106,30 @@ class TbeOpTask : public OpTask { | |||||
| uint32_t max_tiling_size_ = 0; | uint32_t max_tiling_size_ = 0; | ||||
| std::string tiling_data_; | std::string tiling_data_; | ||||
| NodePtr node_; | NodePtr node_; | ||||
| OpDescPtr op_desc_; | |||||
| }; | }; | ||||
| class AiCpuTask : public OpTask { | |||||
// Common base for AICPU single-op tasks (TF-kernel and CC-kernel variants).
// Owns the extended-info blob exchanged with the AICPU kernel, which carries
// dynamic shape information for inputs and outputs.
class AiCpuBaseTask : public OpTask {
 public:
  AiCpuBaseTask() = default;
  ~AiCpuBaseTask() override;  // frees ext_info_addr_dev_
  // Category of unknown-shape handling this op requires.
  const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }

 protected:
  // Parses kernel_ext_info and mirrors it into device memory (ext_info_addr_dev_).
  Status SetExtInfoAndType(const std::string &kernel_ext_info);
  // Refreshes input/output shapes in the device-side ext info before launch.
  Status UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc, std::vector<GeTensorDesc> &output_desc);
  // Reads back kernel-produced output shapes (DEPEND_SHAPE_RANGE path).
  Status UpdateOutputShape(vector<GeTensorDesc> &output_desc);
  // Applies shape_new to output_desc, recomputing the origin shape when the
  // storage format differs from the origin format.
  Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc);

 protected:
  size_t num_inputs_ = 0;
  size_t num_outputs_ = 0;
  UnknowShapeOpType unknown_type_ = DEPEND_IN_SHAPE;
  // Host-side parser/serializer for the AICPU extended info.
  std::unique_ptr<ge::hybrid::AicpuExtInfoHandler> aicpu_ext_handle_;
  // Device copy of the ext info; allocated in SetExtInfoAndType, freed in dtor.
  void *ext_info_addr_dev_ = nullptr;
};
| class AiCpuTask : public AiCpuBaseTask { | |||||
| public: | public: | ||||
| AiCpuTask() = default; | AiCpuTask() = default; | ||||
| ~AiCpuTask() override; | ~AiCpuTask() override; | ||||
| @@ -109,7 +138,24 @@ class AiCpuTask : public OpTask { | |||||
| OpTaskType GetOpTaskType() override { return OP_TASK_AICPU; } | OpTaskType GetOpTaskType() override { return OP_TASK_AICPU; } | ||||
| const void *GetIOAddr() const override; | const void *GetIOAddr() const override; | ||||
| Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs, | |||||
| std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override; | |||||
| Status SetMemCopyTask(const domi::KernelExDef &kernel_def); | |||||
| private: | private: | ||||
| Status SetIO(const vector<void *> &inputs, vector<void *> &outputs); | |||||
| // for copy task. | |||||
| Status InitForSummaryAndCopy(); | |||||
| Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs, | |||||
| rtStream_t stream); | |||||
| Status ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm); | |||||
| Status CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream); | |||||
| Status PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm); | |||||
| Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm); | |||||
| friend class AiCpuTaskBuilder; | friend class AiCpuTaskBuilder; | ||||
| void *workspace_addr_ = nullptr; | void *workspace_addr_ = nullptr; | ||||
| std::string task_info_; | std::string task_info_; | ||||
| @@ -117,10 +163,24 @@ class AiCpuTask : public OpTask { | |||||
| size_t arg_size_ = 0; | size_t arg_size_ = 0; | ||||
| std::string op_type_; | std::string op_type_; | ||||
| void *io_addr_ = nullptr; | void *io_addr_ = nullptr; | ||||
| OpDescPtr op_desc_; | |||||
| bool dynamic_flag_ = false; | |||||
| // for copy task | |||||
| void *copy_task_args_buf_; | |||||
| void *copy_workspace_buf_; | |||||
| std::vector<void *> output_summary_; | |||||
| std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_; | |||||
| void *copy_ioaddr_dev_; | |||||
| void *copy_input_release_flag_dev_; | |||||
| void *copy_input_data_size_dev_; | |||||
| void *copy_input_src_dev_; | |||||
| void *copy_input_dst_dev_; | |||||
| }; | }; | ||||
| class AiCpuCCTask : public OpTask { | |||||
| class AiCpuCCTask : public AiCpuBaseTask { | |||||
| public: | public: | ||||
| AiCpuCCTask() = default; | AiCpuCCTask() = default; | ||||
| ~AiCpuCCTask() override; | ~AiCpuCCTask() override; | ||||
| @@ -137,6 +197,9 @@ class AiCpuCCTask : public OpTask { | |||||
| void SetIoAddr(void *io_addr); | void SetIoAddr(void *io_addr); | ||||
| size_t GetArgSize() const; | size_t GetArgSize() const; | ||||
| Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs, | |||||
| std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override; | |||||
| private: | private: | ||||
| friend class AiCpuCCTaskBuilder; | friend class AiCpuCCTaskBuilder; | ||||
| std::string so_name_; | std::string so_name_; | ||||
| @@ -146,7 +209,6 @@ class AiCpuCCTask : public OpTask { | |||||
| uint32_t block_dim_ = 1; | uint32_t block_dim_ = 1; | ||||
| void *sm_desc_ = nullptr; | void *sm_desc_ = nullptr; | ||||
| void *io_addr_ = nullptr; | void *io_addr_ = nullptr; | ||||
| OpDescPtr op_desc_; | |||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -25,14 +25,16 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC), image normalization (by subtracting the mean value or multiplying a factor), image cropping (by specifying the crop start and cropping the image to the size required by the neural network), and much more. | |||||
| *@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC), | |||||
| image normalization (by subtracting the mean value or multiplying a factor), image cropping | |||||
| (by specifying the crop start and cropping the image to the size required by the neural network), and much more. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. | *@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. | ||||
| *@li params: Dynamic AIPP configuration parameters of type uint8. | |||||
| *@li params: Dynamic AIPP configuration parameters of type uint8. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *aipp_config_path: A required string, specifying the path of the AIPP configuration file | |||||
| *aipp_config_path: A required string, specifying the path of the AIPP configuration file. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *features: The AIPP-processed output tensor of type float16 or uint8. | *features: The AIPP-processed output tensor of type float16 or uint8. | ||||
| @@ -47,17 +49,17 @@ REG_OP(Aipp) | |||||
| .OP_END_FACTORY_REG(Aipp) | .OP_END_FACTORY_REG(Aipp) | ||||
| /** | /** | ||||
| *@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic \n | |||||
| in aipp config file, framework will auto add one input node to graph at last. | |||||
| *@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic | |||||
| in aipp config file, framework will auto add one input node to graph at last. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. | |||||
| *data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *index: specify aipp serial num | |||||
| *index: specify aipp serial num \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *out: The AIPP-processed output tensor of all types. | |||||
| *out: The AIPP-processed output tensor of all types. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator AippData. | *Compatible with the TensorFlow operator AippData. | ||||
| @@ -26,29 +26,29 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of \n | |||||
| taking the DCT-II of a log-magnitude mel-scale spectrogram. | |||||
| *@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of | |||||
| taking the DCT-II of a log-magnitude mel-scale spectrogram . \n | |||||
| *@par Inputs: | |||||
| *Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar. \n | |||||
| *@par Inputs: | |||||
| *Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar. | |||||
| * @li spectrogram: A 3D float tensor. | * @li spectrogram: A 3D float tensor. | ||||
| * @li sample_rate: The MFCC sample rate. | |||||
| * @li sample_rate: The MFCC sample rate . \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li upper_frequency_limit: The highest frequency for calculation. | *@li upper_frequency_limit: The highest frequency for calculation. | ||||
| *@li lower_frequency_limit: The lowest frequency for calculation. | *@li lower_frequency_limit: The lowest frequency for calculation. | ||||
| *@li filterbank_channel_count: Resolution of the Mel bank. | *@li filterbank_channel_count: Resolution of the Mel bank. | ||||
| *@li dct_coefficient_count: Number of output channels to produce \n | |||||
| per time slice. | |||||
| *@li dct_coefficient_count: Number of output channels to produce | |||||
| per time slice . \n | |||||
| *@par Outputs: | |||||
| *y: A Tensor of type float32. | |||||
| *@par Outputs: | |||||
| *y: A Tensor of type float32 . \n | |||||
| *@attention Constraints: \n | |||||
| *Mfcc runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *Mfcc runs on the Ascend AI CPU, which delivers poor performance. | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator Mfcc. | |||||
| *Compatible with the TensorFlow operator Mfcc . \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -64,26 +64,26 @@ REG_OP(Mfcc) | |||||
| .OP_END_FACTORY_REG(Mfcc) | .OP_END_FACTORY_REG(Mfcc) | ||||
| /** | /** | ||||
| *@brief Decodes and generates spectrogram using wav float tensor. | |||||
| *@brief Decodes and generates spectrogram using wav float tensor . \n | |||||
| *@par Inputs: | |||||
| *Input "x" is a 2D matrix. \n | |||||
| * x: A float tensor. Float representation of audio data. | |||||
| *@par Inputs: | |||||
| *Input "x" is a 2D matrix. | |||||
| * x: A float tensor. Float representation of audio data . \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li window_size: Size of the spectrogram window. | *@li window_size: Size of the spectrogram window. | ||||
| *@li stride: Size of the spectrogram stride. | *@li stride: Size of the spectrogram stride. | ||||
| *@li magnitude_squared: If true, uses squared magnitude. | |||||
| *@li magnitude_squared: If true, uses squared magnitude . \n | |||||
| *@par Outputs: | |||||
| *spectrogram: A 3D float Tensor. | |||||
| *@par Outputs: | |||||
| *spectrogram: A 3D float Tensor . \n | |||||
| *@attention Constraints: \n | |||||
| *AudioSpectrogram runs on the Ascend AI CPU, which delivers \n | |||||
| poor performance. | |||||
| *@attention Constraints: | |||||
| *AudioSpectrogram runs on the Ascend AI CPU, which delivers | |||||
| poor performance . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator AudioSpectrogram. | |||||
| *Compatible with the TensorFlow operator AudioSpectrogram . \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -98,26 +98,26 @@ REG_OP(AudioSpectrogram) | |||||
| .OP_END_FACTORY_REG(AudioSpectrogram) | .OP_END_FACTORY_REG(AudioSpectrogram) | ||||
| /** | /** | ||||
| *@brief Decodes a 16-bit WAV file into a float tensor. | |||||
| *@brief Decodes a 16-bit WAV file into a float tensor . \n | |||||
| *@par Inputs: | |||||
| *contents: A Tensor of type string. The WAV-encoded audio, usually from a file. | |||||
| *@par Inputs: | |||||
| *contents: A Tensor of type string. The WAV-encoded audio, usually from a file . \n | |||||
| *@par Attributes: | |||||
| *@li desired_channels: An optional int. Defaults to "-1". \n | |||||
| *@par Attributes: | |||||
| *@li desired_channels: An optional int. Defaults to "-1". | |||||
| Number of sample channels wanted. | Number of sample channels wanted. | ||||
| *@li desired_samples: An optional int. Defaults to "-1". \n | |||||
| Length of audio requested. | |||||
| *@li desired_samples: An optional int. Defaults to "-1". | |||||
| Length of audio requested . \n | |||||
| *@par Outputs: | |||||
| *@par Outputs: | |||||
| *@li *audio: A Tensor of type float32. | *@li *audio: A Tensor of type float32. | ||||
| *@li *sample_rate: A Tensor of type int32. | |||||
| *@li *sample_rate: A Tensor of type int32 . \n | |||||
| *@attention Constraints: \n | |||||
| *DecodeWav runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *DecodeWav runs on the Ascend AI CPU, which delivers poor performance. | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator DecodeWav. | |||||
| *Compatible with the TensorFlow operator DecodeWav . \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -132,21 +132,21 @@ REG_OP(DecodeWav) | |||||
| .OP_END_FACTORY_REG(DecodeWav) | .OP_END_FACTORY_REG(DecodeWav) | ||||
| /** | /** | ||||
| *@brief Encode audio data using the WAV file format. | |||||
| *@brief Encode audio data using the WAV file format . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *Including: \n | |||||
| *Including: | |||||
| * @li audio: A Tensor of type DT_FLOAT. | * @li audio: A Tensor of type DT_FLOAT. | ||||
| * @li sample_rate: A Tensor of type DT_INT32. | |||||
| * @li sample_rate: A Tensor of type DT_INT32 . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *contents: A Tensor of type DT_STRING. | |||||
| *contents: A Tensor of type DT_STRING . \n | |||||
| *@attention Constraints:\n | |||||
| *EncodeWav runs on the Ascend AI CPU, which delivers poor performance.\n | |||||
| *@attention Constraints: | |||||
| *EncodeWav runs on the Ascend AI CPU, which delivers poor performance. | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with tensorflow Operator EncodeWav. | |||||
| *Compatible with tensorflow Operator EncodeWav . \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -26,35 +26,36 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Creates batches of tensors in "x_tensors". | |||||
| *@brief Creates batches of tensors in "x_tensors" . \n | |||||
| *@par Inputs: | |||||
| *Input "x_tensors" is a list or a dictionary of tensors. \n | |||||
| *x_tensors: The list or dictionary of tensors to enqueue. | |||||
| *@par Inputs: | |||||
| *Input "x_tensors" is a list or a dictionary of tensors. | |||||
| *x_tensors: The list or dictionary of tensors to enqueue . | |||||
| It's a dynamic input \n | |||||
| *@par Attributes: | |||||
| *@li num_batch_threads: The number of threads enqueuing "x_tensors". \n | |||||
| *@par Attributes: | |||||
| *@li num_batch_threads: The number of threads enqueuing "x_tensors". | |||||
| The batching will be nondeterministic if "num_batch_threads" > 1. | The batching will be nondeterministic if "num_batch_threads" > 1. | ||||
| *@li max_batch_size: The maximum batch size pulled from the queue. | *@li max_batch_size: The maximum batch size pulled from the queue. | ||||
| *@li max_enqueued_batches: The maximum number of batches pulled from the queue. | *@li max_enqueued_batches: The maximum number of batches pulled from the queue. | ||||
| *@li batch_timeout_micros: The batch processing timeout, in microseconds. | *@li batch_timeout_micros: The batch processing timeout, in microseconds. | ||||
| *@li allowed_batch_sizes: The allowed batch size pulled from the queue. | *@li allowed_batch_sizes: The allowed batch size pulled from the queue. | ||||
| *@li grad_timeout_micros: The gradient batch processing timeout, \n | |||||
| *@li grad_timeout_micros: The gradient batch processing timeout, | |||||
| in microseconds. | in microseconds. | ||||
| *@li container: If non-empty, this queue is placed in the given container. \n | |||||
| *@li container: If non-empty, this queue is placed in the given container. | |||||
| Otherwise, a default container is used. | Otherwise, a default container is used. | ||||
| *@li shared_name: If set, this queue will be shared under the given name \n | |||||
| *@li shared_name: If set, this queue will be shared under the given name | |||||
| across multiple sessions. | across multiple sessions. | ||||
| *@li batching_queue: The queue resource container. | |||||
| *@li batching_queue: The queue resource container . \n | |||||
| *@par Outputs: | |||||
| *@par Outputs: | |||||
| *@li y_index: A Tensor. The index of a BatchTensor. Must be in row-major order. | *@li y_index: A Tensor. The index of a BatchTensor. Must be in row-major order. | ||||
| *@li y_id: A Tensor. The ID of a BatchTensor. Must be in row-major order. | *@li y_id: A Tensor. The ID of a BatchTensor. Must be in row-major order. | ||||
| *@li y_tensors: A list or dictionary of tensors with \n | |||||
| the same types as "x_tensors". | |||||
| *@li y_tensors: A list or dictionary of tensors with | |||||
| the same types as "x_tensors" . It's a dynamic output. \n | |||||
| *@attention Constraints: \n | |||||
| *Batch runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *Batch runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator Batch. | *Compatible with the TensorFlow operator Batch. | ||||
| @@ -79,26 +80,26 @@ REG_OP(Batch) | |||||
| .OP_END_FACTORY_REG(Batch) | .OP_END_FACTORY_REG(Batch) | ||||
| /** | /** | ||||
| *@brief Reverses the operation of Batch for a single output Tensor. | |||||
| *@brief Reverses the operation of Batch for a single output Tensor . \n | |||||
| *@par Inputs: | |||||
| *Input "x_tensors" is a list or a dictionary of tensors. \n | |||||
| *@par Inputs: | |||||
| *Input "x_tensors" is a list or a dictionary of tensors. | |||||
| * @li x_tensors: The list or dictionary of tensors to enqueue. | * @li x_tensors: The list or dictionary of tensors to enqueue. | ||||
| * @li index: The matching "batch_index" obtained from Batch. | * @li index: The matching "batch_index" obtained from Batch. | ||||
| * @li id: The "id" scalar emitted by Batch. | |||||
| * @li id: The "id" scalar emitted by Batch . \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li timeout_micros: The unbatch processing timeout, in microseconds. | *@li timeout_micros: The unbatch processing timeout, in microseconds. | ||||
| *@li container: If non-empty, this queue is placed in the given container. \n | |||||
| *@li container: If non-empty, this queue is placed in the given container. | |||||
| Otherwise, a default container is used. | Otherwise, a default container is used. | ||||
| *@li shared_name: If set, this queue will be shared under the given name \n | |||||
| across multiple sessions. | |||||
| *@li shared_name: If set, this queue will be shared under the given name | |||||
| across multiple sessions . \n | |||||
| *@par Outputs: | |||||
| *y_tensor: A list or dictionary of tensors with the same types as "x_tensors". | |||||
| *@par Outputs: | |||||
| *y_tensor: A list or dictionary of tensors with the same types as "x_tensors" . \n | |||||
| *@attention Constraints: \n | |||||
| *Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator Unbatch. | *Compatible with the TensorFlow operator Unbatch. | ||||
| @@ -117,27 +118,27 @@ REG_OP(Unbatch) | |||||
| .OP_END_FACTORY_REG(Unbatch) | .OP_END_FACTORY_REG(Unbatch) | ||||
| /** | /** | ||||
| *@brief Acts like Batch but using the given "batch_index" index of batching \n | |||||
| things as they become available. | |||||
| *@brief Acts like Batch but using the given "batch_index" index of batching | |||||
| things as they become available . \n | |||||
| *@par Inputs: | |||||
| *Input "x_input" is a list or a dictionary of tensors. \n | |||||
| *@par Inputs: | |||||
| *Input "x_input" is a list or a dictionary of tensors. | |||||
| * @li x_input: The input to the Unbatch operation. | * @li x_input: The input to the Unbatch operation. | ||||
| * @li index: The batch_index given to the Unbatch operation. | * @li index: The batch_index given to the Unbatch operation. | ||||
| * @li id: The "id" scalar emitted by Batch. | * @li id: The "id" scalar emitted by Batch. | ||||
| * @li grad: The downstream gradient. | |||||
| * @li grad: The downstream gradient . \n | |||||
| *@par Attributes: | |||||
| *@li container: If non-empty, this queue is placed in the given container. \n | |||||
| *@par Attributes: | |||||
| *@li container: If non-empty, this queue is placed in the given container. | |||||
| Otherwise, a default container is used. | Otherwise, a default container is used. | ||||
| *@li shared_name: If set, this queue will be shared under the given name \n | |||||
| across multiple sessions. | |||||
| *@li shared_name: If set, this queue will be shared under the given name | |||||
| across multiple sessions . \n | |||||
| *@par Outputs: | |||||
| *y_grad: The return value, either an empty tensor or the batched gradient. | |||||
| *@par Outputs: | |||||
| *y_grad: The return value, either an empty tensor or the batched gradient . \n | |||||
| *@attention Constraints: \n | |||||
| *UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator UnbatchGrad. | *Compatible with the TensorFlow operator UnbatchGrad. | ||||
| @@ -26,20 +26,20 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Element-wise computes the bitwise right-shift of x and y. | |||||
| *@brief Element-wise computes the bitwise right-shift of x and y . \n | |||||
| *@par Inputs: | |||||
| *Input "x" is a k-dimensional tensor. Inputs "num_lower" and "num_upper" \n | |||||
| *@par Inputs: | |||||
| *Input "x" is a k-dimensional tensor. Inputs "num_lower" and "num_upper" | |||||
| are 0D scalars. | are 0D scalars. | ||||
| * @li x: A Tensor. Must be one of the following types: int8, int16, int32, \n | |||||
| int64, uint8, uint16, uint32, uint64. \n | |||||
| * @li y: A Tensor. Has the same type as "x". \n | |||||
| * @li x: A Tensor. Must be one of the following types: int8, int16, int32, | |||||
| int64, uint8, uint16, uint32, uint64. | |||||
| * @li y: A Tensor. Has the same type as "x". \n | |||||
| *@par Outputs: | |||||
| * z: A Tensor. Has the same type as "x". \n | |||||
| *@par Outputs: | |||||
| * z: A Tensor. Has the same type as "x". \n | |||||
| *@attention Constraints: \n | |||||
| *Unique runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *Unique runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator RightShift. | *Compatible with the TensorFlow operator RightShift. | ||||
| @@ -26,28 +26,28 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Bucketizes each feature based on bucket boundaries. | |||||
| *@brief Bucketizes each feature based on bucket boundaries . \n | |||||
| *@par Inputs: | |||||
| *Input "float_values" is a 1D tensor. Input "bucket_boundaries" is \n | |||||
| a list of 1D tensors. | |||||
| * @li float_values: A list of rank 1 tensors each containing float \n | |||||
| *@par Inputs: | |||||
| *Input "float_values" is a 1D tensor. Input "bucket_boundaries" is | |||||
| a list of 1D tensors. It's a dynamic input. | |||||
| * @li float_values: A list of rank 1 tensors each containing float | |||||
| values for a single feature. | values for a single feature. | ||||
| * @li bucket_boundaries: A list of rank 1 tensors each containing \n | |||||
| the bucket boundaries for a single feature. | |||||
| * @li bucket_boundaries: A list of rank 1 tensors each containing | |||||
| the bucket boundaries for a single feature . It's a dynamic input. \n | |||||
| *@par Attributes: | |||||
| *@li num_features: Number of features \n | |||||
| *@par Attributes: | |||||
| *@li num_features: Number of features | |||||
| *@par Outputs: | |||||
| *@li y: A list of rank 1 tensors each containing the bucketized values for \n | |||||
| a single feature. | |||||
| *@par Outputs: | |||||
| *@li y: A list of rank 1 tensors each containing the bucketized values for | |||||
| a single feature . \n | |||||
| *@attention Constraints: \n | |||||
| *@attention Constraints: | |||||
| *BoostedTreesBucketize runs on the Ascend AI CPU, which delivers poor performance. \n | *BoostedTreesBucketize runs on the Ascend AI CPU, which delivers poor performance. \n | ||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator BoostedTreesBucketize. | |||||
| *Compatible with the TensorFlow operator BoostedTreesBucketize . \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -26,44 +26,44 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with \n | |||||
| a learned unigram distribution. | |||||
| *@brief Generates labels for candidate sampling with | |||||
| a learned unigram distribution. \n | |||||
| *@par Inputs: | |||||
| *Input "true_classes" is a 2D matrix. \n | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | |||||
| *@par Inputs: | |||||
| *Input "true_classes" is a 2D matrix. | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, \n | |||||
| *@li unique: If "unique" is true, samples with rejection, | |||||
| so that all sampled candidates in a batch are unique. | so that all sampled candidates in a batch are unique. | ||||
| *This requires some approximation to estimate the post-rejection \n | |||||
| *This requires some approximation to estimate the post-rejection | |||||
| sampling probabilities. | sampling probabilities. | ||||
| *@li range_max: The sampler will sample integers from the interval \n | |||||
| *@li range_max: The sampler will sample integers from the interval | |||||
| [0, range_max). | [0, range_max). | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each | |||||
| element is the ID of a sampled candidate. | element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing \n | |||||
| the number of times each candidate is expected to occur in a batch of sampled \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing | |||||
| the number of times each candidate is expected to occur in a batch of sampled | |||||
| candidates. If "unique" is true, then this is a probability. | candidates. If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", \n | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", | |||||
| for each sampled candidate. | for each sampled candidate. | ||||
| *representing the number of times the candidate is expected to occur \n | |||||
| *representing the number of times the candidate is expected to occur | |||||
| in a batch of sampled candidates. | in a batch of sampled candidates. | ||||
| * If "unique" is true, then this is a probability. \n | |||||
| * If "unique" is true, then this is a probability. | |||||
| *@attention Constraints: \n | |||||
| *ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU, \n | |||||
| which delivers poor performance. | |||||
| *@attention Constraints: | |||||
| *ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU, | |||||
| which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. | |||||
| *Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -82,44 +82,44 @@ REG_OP(ThreadUnsafeUnigramCandidateSampler) | |||||
| .OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler) | .OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with a learned \n | |||||
| unigram distribution. | |||||
| *@brief Generates labels for candidate sampling with a learned | |||||
| unigram distribution. \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | the IDs of the "num_true" "target_classes" in the corresponding original label. | ||||
| *Input "true_classes" is a 2D matrix. | |||||
| *Input "true_classes" is a 2D matrix. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, \n | |||||
| *@li unique: If "unique" is true, samples with rejection, | |||||
| so that all sampled candidates in a batch are unique. | so that all sampled candidates in a batch are unique. | ||||
| *This requires some approximation to estimate the post-rejection \n | |||||
| *This requires some approximation to estimate the post-rejection | |||||
| sampling probabilities. | sampling probabilities. | ||||
| *@li range_max: The sampler will sample integers from the interval \n | |||||
| *@li range_max: The sampler will sample integers from the interval | |||||
| [0, range_max). | [0, range_max). | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", | |||||
| in which each element is the ID of a sampled candidate. | in which each element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the \n | |||||
| number of times each candidate is expected to occur \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the | |||||
| number of times each candidate is expected to occur | |||||
| in a batch of sampled candidates. | in a batch of sampled candidates. | ||||
| *If "unique" is true, then this is a probability. | *If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each \n | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each | |||||
| sampled candidate representing the number of times. | sampled candidate representing the number of times. | ||||
| * the candidate is expected to occur in a batch of sampled candidates. \n | |||||
| *If "unique" is true, then this is a probability. | |||||
| * the candidate is expected to occur in a batch of sampled candidates. | |||||
| *If "unique" is true, then this is a probability. \n | |||||
| *@attention Constraints: \n | |||||
| *UniformCandidateSampler runs on the Ascend AI CPU, \n | |||||
| which delivers poor performance. | |||||
| *@attention Constraints: | |||||
| *UniformCandidateSampler runs on the Ascend AI CPU, | |||||
| which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator UniformCandidateSampler. | |||||
| *Compatible with the TensorFlow operator UniformCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -138,56 +138,56 @@ REG_OP(UniformCandidateSampler) | |||||
| .OP_END_FACTORY_REG(UniformCandidateSampler) | .OP_END_FACTORY_REG(UniformCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with a learned \n | |||||
| unigram distribution. | |||||
| *@brief Generates labels for candidate sampling with a learned | |||||
| unigram distribution. \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | the IDs of the "num_true" "target_classes" in the corresponding original label. | ||||
| * Input "true_classes" is a 2D matrix. | |||||
| * Input "true_classes" is a 2D matrix. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, \n | |||||
| so that all sampled candidates in a batch are unique. This requires \n | |||||
| *@li unique: If "unique" is true, samples with rejection, | |||||
| so that all sampled candidates in a batch are unique. This requires | |||||
| some approximation to estimate the post-rejection sampling probabilities. | some approximation to estimate the post-rejection sampling probabilities. | ||||
| *@li range_max: The sampler will sample integers from the interval [0, range_max). | *@li range_max: The sampler will sample integers from the interval [0, range_max). | ||||
| *@li vocab_file: Each valid line in this file (which should have a \n | |||||
| CSV-like format) corresponds to a valid word ID. \n | |||||
| *@li vocab_file: Each valid line in this file (which should have a | |||||
| CSV-like format) corresponds to a valid word ID. | |||||
| *IDs are in sequential order, starting from num_reserved_ids. | *IDs are in sequential order, starting from num_reserved_ids. | ||||
| *@li distortion: The distortion is used to skew the unigram probability \n | |||||
| distribution. Each weight is first raised to the distortion's power before \n | |||||
| *@li distortion: The distortion is used to skew the unigram probability | |||||
| distribution. Each weight is first raised to the distortion's power before | |||||
| adding to the internal unigram distribution. | adding to the internal unigram distribution. | ||||
| *@li num_reserved_ids: Optionally some reserved IDs can be added in the range \n | |||||
| [0, ..., num_reserved_ids) by the users. \n | |||||
| *@li num_reserved_ids: Optionally some reserved IDs can be added in the range | |||||
| [0, ..., num_reserved_ids) by the users. | |||||
| * One use case is that a special unknown word token is used as ID 0. | * One use case is that a special unknown word token is used as ID 0. | ||||
| *@li num_shards: A sampler can be used to sample from a subset of the \n | |||||
| *@li num_shards: A sampler can be used to sample from a subset of the | |||||
| original range, in order to speed up the whole computation through parallelism. | original range, in order to speed up the whole computation through parallelism. | ||||
| *@li shard: A sampler can be used to sample from a subset of the original \n | |||||
| *@li shard: A sampler can be used to sample from a subset of the original | |||||
| range in order to speed up the whole computation through parallelism. | range in order to speed up the whole computation through parallelism. | ||||
| *@li unigrams: A list of unigram counts or probabilities, one per ID in \n | |||||
| *@li unigrams: A list of unigram counts or probabilities, one per ID in | |||||
| sequential order. | sequential order. | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each | |||||
| element is the ID of a sampled candidate. | element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the \n | |||||
| number of times each candidate is expected to occur in a batch of sampled \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the | |||||
| number of times each candidate is expected to occur in a batch of sampled | |||||
| candidates. If "unique" is true, then this is a probability. | candidates. If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", \n | |||||
| for each sampled candidate representing the number of times the candidate is \n | |||||
| expected to occur in a batch of sampled candidates. \n | |||||
| If "unique" is true, then this is a probability. | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", | |||||
| for each sampled candidate representing the number of times the candidate is | |||||
| expected to occur in a batch of sampled candidates. | |||||
| If "unique" is true, then this is a probability. \n | |||||
| *@attention Constraints: \n | |||||
| * FixedUnigramCandidateSampler runs on the Ascend AI CPU, \n | |||||
| which delivers poor performance. | |||||
| *@attention Constraints: | |||||
| * FixedUnigramCandidateSampler runs on the Ascend AI CPU, | |||||
| which delivers poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator FixedUnigramCandidateSampler. | |||||
| *Compatible with the TensorFlow operator FixedUnigramCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -212,43 +212,43 @@ REG_OP(FixedUnigramCandidateSampler) | |||||
| .OP_END_FACTORY_REG(FixedUnigramCandidateSampler) | .OP_END_FACTORY_REG(FixedUnigramCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with a learned \n | |||||
| unigram distribution. | |||||
| *@brief Generates labels for candidate sampling with a learned | |||||
| unigram distribution. \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | the IDs of the "num_true" "target_classes" in the corresponding original label. | ||||
| * Input "true_classes" is a 2D matrix. | |||||
| * Input "true_classes" is a 2D matrix. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, \n | |||||
| so that all sampled candidates in a batch are unique. \n | |||||
| *This requires some approximation to estimate the post-rejection \n | |||||
| *@li unique: If "unique" is true, samples with rejection, | |||||
| so that all sampled candidates in a batch are unique. | |||||
| *This requires some approximation to estimate the post-rejection | |||||
| sampling probabilities. | sampling probabilities. | ||||
| *@li range_max: The sampler will sample integers from the interval \n | |||||
| *@li range_max: The sampler will sample integers from the interval | |||||
| [0, range_max). | [0, range_max). | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each | |||||
| element is the ID of a sampled candidate. | element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing \n | |||||
| the number of times each candidate is expected to occur in a batch of sampled candidates. \n | |||||
| *If "unique" is true, then this is a probability. | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each \n | |||||
| sampled candidate representing the number of times the candidate is expected \n | |||||
| to occur in a batch of sampled candidates. \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing | |||||
| the number of times each candidate is expected to occur in a batch of sampled candidates. | |||||
| *If "unique" is true, then this is a probability. | *If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each | |||||
| sampled candidate representing the number of times the candidate is expected | |||||
| to occur in a batch of sampled candidates. | |||||
| *If "unique" is true, then this is a probability. \n | |||||
| *@attention Constraints: \n | |||||
| *LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers \n | |||||
| poor performance. | |||||
| *@attention Constraints: | |||||
| *LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers | |||||
| poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. | |||||
| *Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -267,42 +267,42 @@ REG_OP(LearnedUnigramCandidateSampler) | |||||
| .OP_END_FACTORY_REG(LearnedUnigramCandidateSampler) | .OP_END_FACTORY_REG(LearnedUnigramCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with a log-uniform \n | |||||
| distribution. | |||||
| *@brief Generates labels for candidate sampling with a log-uniform | |||||
| distribution. \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. \n | |||||
| * Input "true_classes" is a 2D matrix. | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | |||||
| * Input "true_classes" is a 2D matrix. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, so that all \n | |||||
| sampled candidates in a batch are unique. This requires some approximation \n | |||||
| *@li unique: If "unique" is true, samples with rejection, so that all | |||||
| sampled candidates in a batch are unique. This requires some approximation | |||||
| to estimate the post-rejection sampling probabilities. | to estimate the post-rejection sampling probabilities. | ||||
| *@li range_max: The sampler will sample integers from the interval \n | |||||
| *@li range_max: The sampler will sample integers from the interval | |||||
| [0, range_max). | [0, range_max). | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", in which each | |||||
| element is the ID of a sampled candidate. | element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing \n | |||||
| the number of times each candidate is expected to occur in a batch of sampled \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing | |||||
| the number of times each candidate is expected to occur in a batch of sampled | |||||
| candidates. If "unique" is true, then this is a probability. | candidates. If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each \n | |||||
| sampled candidate representing the number of times the candidate is expected \n | |||||
| to occur in a batch of sampled candidates. \n | |||||
| *If "unique" is true, then this is a probability. | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each | |||||
| sampled candidate representing the number of times the candidate is expected | |||||
| to occur in a batch of sampled candidates. | |||||
| *If "unique" is true, then this is a probability. \n | |||||
| *@attention Constraints: \n | |||||
| *LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers \n | |||||
| poor performance. | |||||
| *@attention Constraints: | |||||
| *LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers | |||||
| poor performance. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator LogUniformCandidateSampler. | |||||
| *Compatible with the TensorFlow operator LogUniformCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -321,38 +321,38 @@ REG_OP(LogUniformCandidateSampler) | |||||
| .OP_END_FACTORY_REG(LogUniformCandidateSampler) | .OP_END_FACTORY_REG(LogUniformCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Generates labels for candidate sampling with a learned \n | |||||
| unigram distribution. | |||||
| *@brief Generates labels for candidate sampling with a learned | |||||
| unigram distribution. \n | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains \n | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. \n | |||||
| * Input "true_classes" is a 2D matrix. | |||||
| *@par Inputs: | |||||
| *true_classes: A "batch_size * num_true" matrix, in which each row contains | |||||
| the IDs of the "num_true" "target_classes" in the corresponding original label. | |||||
| * Input "true_classes" is a 2D matrix. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li num_sampled: Number of candidates to randomly sample. | *@li num_sampled: Number of candidates to randomly sample. | ||||
| *@li unique: If "unique" is true, samples with rejection, \n | |||||
| so that all sampled candidates in a batch are unique. This requires some \n | |||||
| *@li unique: If "unique" is true, samples with rejection, | |||||
| so that all sampled candidates in a batch are unique. This requires some | |||||
| approximation to estimate the post-rejection sampling probabilities. | approximation to estimate the post-rejection sampling probabilities. | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", \n | |||||
| *@par Outputs: | |||||
| *@li sampled_candidates: A vector of length "num_sampled", | |||||
| in which each element is the ID of a sampled candidate. | in which each element is the ID of a sampled candidate. | ||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the \n | |||||
| number of times each candidate is expected to occur in a batch of sampled candidates. \n | |||||
| *@li true_expected_count: A "batch_size * num_true" matrix, representing the | |||||
| number of times each candidate is expected to occur in a batch of sampled candidates. | |||||
| *If "unique" is true, then this is a probability. | *If "unique" is true, then this is a probability. | ||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each \n | |||||
| sampled candidate representing the number of times the candidate is expected \n | |||||
| to occur in a batch of sampled candidates. If "unique" is true, then this is a probability. | |||||
| *@li sampled_expected_count: A vector of length "num_sampled", for each | |||||
| sampled candidate representing the number of times the candidate is expected | |||||
| to occur in a batch of sampled candidates. If "unique" is true, then this is a probability. \n | |||||
| *@attention Constraints: \n | |||||
| *AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance. | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator AllCandidateSampler. | |||||
| *Compatible with the TensorFlow operator AllCandidateSampler. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -370,31 +370,31 @@ REG_OP(AllCandidateSampler) | |||||
| .OP_END_FACTORY_REG(AllCandidateSampler) | .OP_END_FACTORY_REG(AllCandidateSampler) | ||||
| /** | /** | ||||
| *@brief Computes the "ids" of the positions in "sampled_candidates" that \n | |||||
| match "true_labels". | |||||
| *@brief Computes the "ids" of the positions in "sampled_candidates" that | |||||
| match "true_labels". \n | |||||
| *@par Inputs: | |||||
| * @li Input "true_classes" is a 2D matrix. \n | |||||
| * @li true_classes: The "true_classes" output of UnpackSparseLabels. \n | |||||
| * @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n | |||||
| *@par Inputs: | |||||
| * @li Input "true_classes" is a 2D matrix. | |||||
| * @li true_classes: The "true_classes" output of UnpackSparseLabels. | |||||
| * @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n | |||||
| *@par Attributes: | |||||
| *@par Attributes: | |||||
| *@li num_true: Number of true labels per context. | *@li num_true: Number of true labels per context. | ||||
| *@li seed: If either "seed" or "seed2" are set to be non-zero. | *@li seed: If either "seed" or "seed2" are set to be non-zero. | ||||
| *@li seed2: A second seed to avoid seed collision. | |||||
| *@li seed2: A second seed to avoid seed collision. \n | |||||
| *@par Outputs: | |||||
| *@par Outputs: | |||||
| * @li indices: A vector of indices corresponding to rows of "true_candidates". | * @li indices: A vector of indices corresponding to rows of "true_candidates". | ||||
| * @li ids: A vector of IDs of positions in "sampled_candidates" that match a \n | |||||
| * @li ids: A vector of IDs of positions in "sampled_candidates" that match a | |||||
| "true_label" for the row with the corresponding index in indices. | "true_label" for the row with the corresponding index in indices. | ||||
| * @li weights: A vector of the same length as "indices" and "ids", in which \n | |||||
| each element is -FLOAT_MAX. | |||||
| * @li weights: A vector of the same length as "indices" and "ids", in which | |||||
| each element is -FLOAT_MAX. \n | |||||
| *@attention Constraints: \n | |||||
| *ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance. \n | |||||
| *@attention Constraints: | |||||
| *ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance. | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *Compatible with the TensorFlow operator ComputeAccidentalHits. | |||||
| *Compatible with the TensorFlow operator ComputeAccidentalHits. \n | |||||
| *@par Restrictions: | *@par Restrictions: | ||||
| *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. | ||||
| @@ -26,17 +26,17 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Take elements from data if specific condition is satisfied on mask. | |||||
| *@brief Take elements from data if specific condition is satisfied on mask. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li data: input tensor from which to take elements, High-dimension input would \n | |||||
| *@li data: input tensor from which to take elements, High-dimension input would | |||||
| first be flattened. | first be flattened. | ||||
| *@li mask: condition param; must be the same shape with data. | |||||
| *@li mask: condition param; must be the same shape with data. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li mode:convert by convert in Mode. | *@li mode:convert by convert in Mode. | ||||
| *@li val:convert by <class 'float'> | *@li val:convert by <class 'float'> | ||||
| *@li eps:convert by <class 'float'> (default: 1e-06) | |||||
| *@li eps:convert by <class 'float'> (default: 1e-06) \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li out_data: the elements taken | *@li out_data: the elements taken | ||||
| @@ -27,21 +27,21 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Forwards the value of an available tensor from input "x" to output "y". \n | |||||
| * Merge waits for at least one of the input tensors to become available. \n | |||||
| * It is usually combined with Switch to implement branching. \n | |||||
| * Merge forwards the first tensor to become available to output "y", \n | |||||
| * and sets "value_index" the index of the tensor in inputs. | |||||
| *@brief Forwards the value of an available tensor from input "x" to output "y". | |||||
| * Merge waits for at least one of the input tensors to become available. | |||||
| * It is usually combined with Switch to implement branching. | |||||
| * Merge forwards the first tensor to become available to output "y", | |||||
| * and sets "value_index" the index of the tensor in inputs . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The input tensors, one of which will become available. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The input tensors, one of which will become available. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . It's a dynamic input. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li y: The available tensor. Has the same type as "x". | *@li y: The available tensor. Has the same type as "x". | ||||
| *@li value_index: A scalar of type int32, for the index of the chosen input \n | |||||
| * tensor. | |||||
| *@li value_index: A scalar of type int32, for the index of the chosen input | |||||
| * tensor . \n | |||||
| *@see Switch() | *@see Switch() | ||||
| @@ -59,21 +59,21 @@ REG_OP(Merge) | |||||
| .OP_END_FACTORY_REG(Merge) | .OP_END_FACTORY_REG(Merge) | ||||
| /** | /** | ||||
| *@brief Forwards the value of an available tensor from input "x" to output "y". \n | |||||
| * Merge waits for at least one of the input tensors to become available. \n | |||||
| * It is usually combined with Switch to implement branching. \n | |||||
| * Merge forwards the first tensor to become available to output "y", \n | |||||
| * and sets "value_index" the index of the tensor in inputs. | |||||
| *@brief Forwards the value of an available tensor from input "x" to output "y". | |||||
| * Merge waits for at least one of the input tensors to become available. | |||||
| * It is usually combined with Switch to implement branching. | |||||
| * Merge forwards the first tensor to become available to output "y", | |||||
| * and sets "value_index" the index of the tensor in inputs . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The input tensors, one of which will become available. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The input tensors, one of which will become available. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . It's a dynamic input. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li y: The available tensor. Has the same type as "x". | *@li y: The available tensor. Has the same type as "x". | ||||
| *@li value_index: A scalar of type int32, for the index of the chosen input \n | |||||
| * tensor. | |||||
| *@li value_index: A scalar of type int32, for the index of the chosen input | |||||
| * tensor . \n | |||||
| *@see Switch() | Merge() | *@see Switch() | Merge() | ||||
| @@ -91,21 +91,21 @@ REG_OP(RefMerge) | |||||
| .OP_END_FACTORY_REG(RefMerge) | .OP_END_FACTORY_REG(RefMerge) | ||||
| /** | /** | ||||
| *@brief Forwards "data" to the output port determined by "pred". \n | |||||
| * If "pred" is "true", the data input is forwarded to "output_true". \n | |||||
| * Otherwise, the data is forwarded to "output_false". | |||||
| *@brief Forwards "data" to the output port determined by "pred". | |||||
| * If "pred" is "true", the data input is forwarded to "output_true". | |||||
| * Otherwise, the data is forwarded to "output_false" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li data: The tensor to be forwarded. \n | *@li data: The tensor to be forwarded. \n | ||||
| * Must be one of the following types: float16, float32, float64, \n | |||||
| * Must be one of the following types: float16, float32, float64, | |||||
| * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | ||||
| *@li pred: A boolean scalar. The output port that will receive data. | |||||
| *@li pred: A boolean scalar. The output port that will receive data . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li output_false: If "pred" is "false", data will be forwarded to this output. \n | |||||
| *@li output_false: If "pred" is "false", data will be forwarded to this output. | |||||
| * Has the same type as "data". | * Has the same type as "data". | ||||
| *@li output_true: If "pred" is "true", data will be forwarded to this output. \n | |||||
| * Has the same type as "data". | |||||
| *@li output_true: If "pred" is "true", data will be forwarded to this output. | |||||
| * Has the same type as "data" . \n | |||||
| *@see Merge() | *@see Merge() | ||||
| @@ -126,21 +126,21 @@ REG_OP(Switch) | |||||
| .OP_END_FACTORY_REG(Switch) | .OP_END_FACTORY_REG(Switch) | ||||
| /** | /** | ||||
| *@brief Forwards "data" to the output port determined by "pred". \n | |||||
| * If "pred" is "true", the data input is forwarded to "output_true". \n | |||||
| * Otherwise, the data is forwarded to "output_false". | |||||
| *@brief Forwards "data" to the output port determined by "pred". | |||||
| * If "pred" is "true", the data input is forwarded to "output_true". | |||||
| * Otherwise, the data is forwarded to "output_false" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li data: The ref tensor to be forwarded. \n | |||||
| * Must be one of the following types: float16, float32, float64, \n | |||||
| *@li data: The ref tensor to be forwarded. | |||||
| * Must be one of the following types: float16, float32, float64, | |||||
| * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | ||||
| *@li pred: A boolean scalar. The output port that will receive data. | |||||
| *@li pred: A boolean scalar. The output port that will receive data . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li output_false: If "pred" is "false", data will be forwarded to this output. \n | |||||
| *@li output_false: If "pred" is "false", data will be forwarded to this output. | |||||
| * Has the same type as "data". | * Has the same type as "data". | ||||
| *@li output_true: If "pred" is "true", data will be forwarded to this output. \n | |||||
| * Has the same type as "data". | |||||
| *@li output_true: If "pred" is "true", data will be forwarded to this output. | |||||
| * Has the same type as "data" . \n | |||||
| *@see Merge() | Switch() | *@see Merge() | Switch() | ||||
| @@ -161,16 +161,16 @@ REG_OP(RefSwitch) | |||||
| .OP_END_FACTORY_REG(RefSwitch) | .OP_END_FACTORY_REG(RefSwitch) | ||||
| /** | /** | ||||
| *@brief Forwards "data" to the output port determined by "pred_value". | |||||
| *@brief Forwards "data" to the output port determined by "pred_value" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li data: The tensor to be forwarded. \n | *@li data: The tensor to be forwarded. \n | ||||
| * Must be one of the following types: float16, float32, float64, \n | |||||
| * Must be one of the following types: float16, float32, float64, | |||||
| * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool. | ||||
| *@li pred_value: A int64 tensor which determines the output port that will receive data. | |||||
| *@li pred_value: A int64 tensor which determines the output port that will receive data . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors, one of which will become available. \n | |||||
| *output: The output tensors, one of which will become available. | |||||
| * Has the same type as "data". | * Has the same type as "data". | ||||
| */ | */ | ||||
| REG_OP(SwitchN) | REG_OP(SwitchN) | ||||
| @@ -184,24 +184,24 @@ REG_OP(SwitchN) | |||||
| .OP_END_FACTORY_REG(SwitchN) | .OP_END_FACTORY_REG(SwitchN) | ||||
| /** | /** | ||||
| *@brief Creates or finds a child frame, and makes "x" available to the child \n | |||||
| * frame. This op is used together with Exit to create loops in the graph. \n | |||||
| * The Executor uses the unique "frame_name" to identify frames. \n | |||||
| * If "is_constant" is "true", output "y" is a constant in the child \n | |||||
| * frame; otherwise it may be changed in the child frame. | |||||
| *@brief Creates or finds a child frame, and makes "x" available to the child | |||||
| * frame. This op is used together with Exit to create loops in the graph. | |||||
| * The Executor uses the unique "frame_name" to identify frames. | |||||
| * If "is_constant" is "true", output "y" is a constant in the child | |||||
| * frame; otherwise it may be changed in the child frame . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the child frame. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the child frame. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li frame_name: A required string. The name of the child frame. | *@li frame_name: A required string. The name of the child frame. | ||||
| *@li is_constant: A required bool. If true, the output is constant in \n | |||||
| * the child frame. | |||||
| *@li is_constant: A required bool. If true, the output is constant in | |||||
| * the child frame . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A Tensor. Has the same type as "x". | |||||
| *y: A Tensor. Has the same type as "x" . \n | |||||
| *@see Exit() | *@see Exit() | ||||
| @@ -220,24 +220,24 @@ REG_OP(Enter) | |||||
| .OP_END_FACTORY_REG(Enter) | .OP_END_FACTORY_REG(Enter) | ||||
| /** | /** | ||||
| *@brief Creates or finds a child frame, and makes "x" available to the child \n | |||||
| * frame. This op is used together with Exit to create loops in the graph. \n | |||||
| * The Executor uses the unique "frame_name" to identify frames. \n | |||||
| * If "is_constant" is "true", output "y" is a constant in the child \n | |||||
| * frame; otherwise it may be changed in the child frame. | |||||
| *@brief Creates or finds a child frame, and makes "x" available to the child | |||||
| * frame. This op is used together with Exit to create loops in the graph. | |||||
| * The Executor uses the unique "frame_name" to identify frames. | |||||
| * If "is_constant" is "true", output "y" is a constant in the child | |||||
| * frame; otherwise it may be changed in the child frame . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the child frame. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the child frame. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li frame_name: A required string. The name of the child frame. | *@li frame_name: A required string. The name of the child frame. | ||||
| *@li is_constant: A required bool. If true, the output is constant in \n | |||||
| * the child frame. | |||||
| *@li is_constant: A required bool. If true, the output is constant in | |||||
| * the child frame . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A tensor. Has the same type as "x". | |||||
| *y: A tensor. Has the same type as "x" . \n | |||||
| *@see Exit() | Enter() | *@see Exit() | Enter() | ||||
| @@ -256,14 +256,14 @@ REG_OP(RefEnter) | |||||
| .OP_END_FACTORY_REG(RefEnter) | .OP_END_FACTORY_REG(RefEnter) | ||||
| /** | /** | ||||
| *@brief Forwards the input to the output. This op represents the loop \n | |||||
| * termination condition. | |||||
| *@brief Forwards the input to the output. This op represents the loop | |||||
| * termination condition . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: A boolean scalar. The condition of the Switch op. | |||||
| *x: A boolean scalar. The condition of the Switch op . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: The tensor "x". | |||||
| *y: The tensor "x" . \n | |||||
| *@see Switch() | *@see Switch() | ||||
| @@ -276,15 +276,15 @@ REG_OP(LoopCond) | |||||
| .OP_END_FACTORY_REG(LoopCond) | .OP_END_FACTORY_REG(LoopCond) | ||||
| /** | /** | ||||
| *@brief Makes the input available to the next iteration. | |||||
| *@brief Makes the input available to the next iteration . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the next iteration. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the next iteration. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A Tensor. Has the same type as "x". | |||||
| *y: A Tensor. Has the same type as "x" . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator NextIteration. | *@Compatible with the TensorFlow operator NextIteration. | ||||
| @@ -299,15 +299,15 @@ REG_OP(NextIteration) | |||||
| .OP_END_FACTORY_REG(NextIteration) | .OP_END_FACTORY_REG(NextIteration) | ||||
| /** | /** | ||||
| *@brief Makes the input available to the next iteration. | |||||
| *@brief Makes the input available to the next iteration . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the next iteration. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the next iteration. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A tensor. Has the same type as "x". | |||||
| *y: A tensor. Has the same type as "x" . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator RefNextIteration. | *@Compatible with the TensorFlow operator RefNextIteration. | ||||
| @@ -322,15 +322,15 @@ REG_OP(RefNextIteration) | |||||
| .OP_END_FACTORY_REG(RefNextIteration) | .OP_END_FACTORY_REG(RefNextIteration) | ||||
| /** | /** | ||||
| *@brief Exits the current frame to its parent frame. | |||||
| *@brief Exits the current frame to its parent frame . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the parent frame. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the parent frame. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A Tensor. Has the same type as "x". | |||||
| *y: A Tensor. Has the same type as "x" . \n | |||||
| *@see Enter() | *@see Enter() | ||||
| @@ -347,15 +347,15 @@ REG_OP(Exit) | |||||
| .OP_END_FACTORY_REG(Exit) | .OP_END_FACTORY_REG(Exit) | ||||
| /** | /** | ||||
| *@brief Exits the current frame to its parent frame. | |||||
| *@brief Exits the current frame to its parent frame . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *x: The tensor to be made available to the parent frame. \n | |||||
| * Must be one of the following types: float16, float32, float64, int8, \n | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool. | |||||
| *x: The tensor to be made available to the parent frame. | |||||
| * Must be one of the following types: float16, float32, float64, int8, | |||||
| * int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *y: A tensor. Has the same type as "x". | |||||
| *y: A tensor. Has the same type as "x" . \n | |||||
| *@see Enter() | Exit() | *@see Enter() | Exit() | ||||
| @@ -372,9 +372,9 @@ REG_OP(RefExit) | |||||
| .OP_END_FACTORY_REG(RefExit) | .OP_END_FACTORY_REG(RefExit) | ||||
| /** | /** | ||||
| *@brief Only useful as a placeholder for control edges. \n | |||||
| * It is similar to a no-op that always produces a live control output \n | |||||
| * even when some control inputs are dead. | |||||
| *@brief Only useful as a placeholder for control edges. | |||||
| * It is similar to a no-op that always produces a live control output | |||||
| * even when some control inputs are dead . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator ControlTrigger. | *@Compatible with the TensorFlow operator ControlTrigger. | ||||
| @@ -389,7 +389,7 @@ REG_OP(ControlTrigger) | |||||
| * Three inputs, including: | * Three inputs, including: | ||||
| *@li x: One dimensional tensore of type int32, specifying queried shape, max size is 8. | *@li x: One dimensional tensore of type int32, specifying queried shape, max size is 8. | ||||
| *@li data_seq: One dimensional tensore of type int32, specifying the mapped table is queried. | *@li data_seq: One dimensional tensore of type int32, specifying the mapped table is queried. | ||||
| *@li level_index: One dimensional tensore of type int32, specifying secondary index. | |||||
| *@li level_index: One dimensional tensore of type int32, specifying secondary index. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map. | *@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map. | ||||
| @@ -27,29 +27,29 @@ | |||||
| namespace ge { | namespace ge { | ||||
| /** | /** | ||||
| *@brief Calculates the CTC Loss (log probability) for each batch entry. \n | |||||
| Also calculates the gradient. | |||||
| *@brief Calculates the CTC Loss (log probability) for each batch entry. | |||||
| Also calculates the gradient. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | ||||
| *@li labels_indices: The indices of a `SparseTensor<int32, 2>`. \n | |||||
| `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for \n | |||||
| *@li labels_indices: The indices of a `SparseTensor<int32, 2>`. | |||||
| `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for | |||||
| `(batch b, time t)`. | `(batch b, time t)`. | ||||
| *@li labels_values: The values (labels) associated with the given batch and time. | *@li labels_values: The values (labels) associated with the given batch and time. | ||||
| *@li sequence_length: A vector containing sequence lengths (batch). | |||||
| *@li sequence_length: A vector containing sequence lengths (batch). \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li loss: A vector (batch) containing log-probabilities. | *@li loss: A vector (batch) containing log-probabilities. | ||||
| *@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x \n | |||||
| batch_size x num_classes)`. | |||||
| *@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x | |||||
| batch_size x num_classes)`. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to \n | |||||
| *@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to | |||||
| the CTC calculation.If not specified, defaults to false | the CTC calculation.If not specified, defaults to false | ||||
| *@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation \n | |||||
| repeated non-blank labels will not be merged and are interpreted as \n | |||||
| individual labels. This is a simplified version of CTC. \n | |||||
| If not specified, defaults to true | |||||
| *@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation | |||||
| repeated non-blank labels will not be merged and are interpreted as | |||||
| individual labels. This is a simplified version of CTC. | |||||
| If not specified, defaults to true. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| * Compatible with TensorFlow CTCLoss operator. | * Compatible with TensorFlow CTCLoss operator. | ||||
| @@ -67,24 +67,24 @@ REG_OP(CTCLoss) | |||||
| .OP_END_FACTORY_REG(CTCLoss) | .OP_END_FACTORY_REG(CTCLoss) | ||||
| /** | /** | ||||
| *@brief Performs greedy decoding on the logits given in inputs. | |||||
| *@brief Performs greedy decoding on the logits given in inputs. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | ||||
| *@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. | |||||
| *@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li merge_repeated: If True, merge repeated classes in output. | |||||
| *@li merge_repeated: If True, merge repeated classes in output. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,\n | |||||
| *@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`, | |||||
| of a `SparseTensor<int64, 2>`. The rows store: [batch, time]. | of a `SparseTensor<int64, 2>`. The rows store: [batch, time]. | ||||
| *@li decoded_values: Values vector, size: `(total_decoded_outputs)`,\n | |||||
| *@li decoded_values: Values vector, size: `(total_decoded_outputs)`, | |||||
| of a `SparseTensor<int64, 2>`. The vector stores the decoded classes. | of a `SparseTensor<int64, 2>`. The vector stores the decoded classes. | ||||
| *@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.\n | |||||
| *@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor. | |||||
| Values are: `[batch_size, max_decoded_length]`. | Values are: `[batch_size, max_decoded_length]`. | ||||
| *@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence\n | |||||
| log-probabilities. | |||||
| *@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence | |||||
| log-probabilities. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| * Compatible with TensorFlow CTCGreedyDecoder operator. | * Compatible with TensorFlow CTCGreedyDecoder operator. | ||||
| @@ -100,27 +100,27 @@ REG_OP(CTCGreedyDecoder) | |||||
| .OP_END_FACTORY_REG(CTCGreedyDecoder) | .OP_END_FACTORY_REG(CTCGreedyDecoder) | ||||
| /** | /** | ||||
| *@brief Performs beam search decoding on the logits given in input. | |||||
| *@brief Performs beam search decoding on the logits given in input. \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | *@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. | ||||
| *@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. | |||||
| *@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li merge_repeated: If True, merge repeated classes in output. | |||||
| *@li merge_repeated: If True, merge repeated classes in output. \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,\n | |||||
| size `(total_decoded_outputs[j] x 2)`, has indices of a\n | |||||
| *@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j, | |||||
| size `(total_decoded_outputs[j] x 2)`, has indices of a | |||||
| `SparseTensor<int64, 2>`. The rows store: [batch, time]. | `SparseTensor<int64, 2>`. The rows store: [batch, time]. | ||||
| *@li decoded_values: A list (length: top_paths) of values vectors. Vector j,\n | |||||
| size `(length total_decoded_outputs[j])`, has the values of a\n | |||||
| *@li decoded_values: A list (length: top_paths) of values vectors. Vector j, | |||||
| size `(length total_decoded_outputs[j])`, has the values of a | |||||
| `SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j. | `SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j. | ||||
| *@li decoded_shape: A list (length: top_paths) of shape vector. Vector j,\n | |||||
| size `(2)`, stores the shape of the decoded `SparseTensor[j]`.\n | |||||
| *@li decoded_shape: A list (length: top_paths) of shape vector. Vector j, | |||||
| size `(2)`, stores the shape of the decoded `SparseTensor[j]`. | |||||
| Its values are: `[batch_size, max_decoded_length[j]]`. | Its values are: `[batch_size, max_decoded_length[j]]`. | ||||
| *@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The\n | |||||
| sequence log-probabilities. | |||||
| *@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The | |||||
| sequence log-probabilities. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| * Compatible with TensorFlow CTCBeamSearchDecoder operator. | * Compatible with TensorFlow CTCBeamSearchDecoder operator. | ||||
| @@ -25,40 +25,27 @@ | |||||
| #include "graph/operator.h" | #include "graph/operator.h" | ||||
| namespace ge { | namespace ge { | ||||
| REG_OP(SymbolicGradient) | |||||
| .DYNAMIC_INPUT(input, TensorType::ALL()) | |||||
| .DYNAMIC_OUTPUT(output, TensorType::ALL()) | |||||
| .GRAPH(f) | |||||
| .OP_END_FACTORY_REG(SymbolicGradient) | |||||
| REG_OP(RemoteCall) | |||||
| .INPUT(target, DT_STRING) | |||||
| .DYNAMIC_INPUT(args, TensorType::ALL()) | |||||
| .DYNAMIC_OUTPUT(output, TensorType::ALL()) | |||||
| .GRAPH(f) | |||||
| .OP_END_FACTORY_REG(RemoteCall) | |||||
| /** | /** | ||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n | |||||
| * If "cond" means True, the selected subgraph is "then_branch". \n | |||||
| * Otherwise, the selected subgraph is "else_branch". | |||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. | |||||
| * If "cond" means True, the selected subgraph is "then_branch". | |||||
| * Otherwise, the selected subgraph is "else_branch" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; | |||||
| * if "cond" is not a scalar, non-empty means True and empty means False. | * if "cond" is not a scalar, non-empty means True and empty means False. | ||||
| *@li input: The input tensors. | |||||
| *@li input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what else_branch returns. | * whose types are the same as what else_branch returns. | ||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| * whose types are the same as what then_branch returns. | |||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what then_branch returns . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input). | |||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input) . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator _If. | *@Compatible with the TensorFlow operator _If. | ||||
| @@ -72,26 +59,26 @@ REG_OP(_If) | |||||
| .OP_END_FACTORY_REG(_If) | .OP_END_FACTORY_REG(_If) | ||||
| /** | /** | ||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n | |||||
| * If "cond" means True, the selected subgraph is "then_branch". \n | |||||
| * Otherwise, the selected subgraph is "else_branch". | |||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. | |||||
| * If "cond" means True, the selected subgraph is "then_branch". | |||||
| * Otherwise, the selected subgraph is "else_branch" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; | |||||
| * if "cond" is not a scalar, non-empty means True and empty means False. | * if "cond" is not a scalar, non-empty means True and empty means False. | ||||
| *@li input: The input tensors. | |||||
| *@li input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what else_branch returns. | * whose types are the same as what else_branch returns. | ||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| * whose types are the same as what then_branch returns. | |||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what then_branch returns . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input). | |||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input) . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator StatelessIf. | *@Compatible with the TensorFlow operator StatelessIf. | ||||
| @@ -105,26 +92,26 @@ REG_OP(StatelessIf) | |||||
| .OP_END_FACTORY_REG(StatelessIf) | .OP_END_FACTORY_REG(StatelessIf) | ||||
| /** | /** | ||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n | |||||
| * If "cond" means True, the selected subgraph is "then_branch". \n | |||||
| * Otherwise, the selected subgraph is "else_branch". | |||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. | |||||
| * If "cond" means True, the selected subgraph is "then_branch". | |||||
| * Otherwise, the selected subgraph is "else_branch" . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A Tensor. If "cond" is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if "cond" is a numerical scalar, non-zero means True and zero means False; | |||||
| * if "cond" is a string scalar, non-empty means True and empty means False; | |||||
| * if "cond" is not a scalar, non-empty means True and empty means False. | * if "cond" is not a scalar, non-empty means True and empty means False. | ||||
| *@li input: The input tensors. | |||||
| *@li input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| *@li then_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what else_branch returns. | * whose types are the same as what else_branch returns. | ||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n | |||||
| * whose types are the same as what then_branch returns. | |||||
| *@li else_branch: A subgraph takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what then_branch returns . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input). | |||||
| *output: The output tensors returned by either then_branch(input) or else_branch(input) . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator If. | *@Compatible with the TensorFlow operator If. | ||||
| @@ -138,18 +125,18 @@ REG_OP(If) | |||||
| .OP_END_FACTORY_REG(If) | .OP_END_FACTORY_REG(If) | ||||
| /** | /** | ||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors. | |||||
| *@brief Select one of the subgraphs to pass the input tensors and return the output tensors . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li branch_index: A int32 scalar which determines the selected subgraph. | *@li branch_index: A int32 scalar which determines the selected subgraph. | ||||
| *@li input: The input tensors, which will be passed to the subgraph. | |||||
| *@li input: The input tensors, which will be passed to the subgraph . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors, \n | |||||
| * whose types are the same as what every other subgraph returns. | |||||
| *branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors, | |||||
| * whose types are the same as what every other subgraph returns . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by one of branches. | |||||
| *output: The output tensors returned by one of branches . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator Case. | *@Compatible with the TensorFlow operator Case. | ||||
| @@ -162,25 +149,25 @@ REG_OP(Case) | |||||
| .OP_END_FACTORY_REG(Case) | .OP_END_FACTORY_REG(Case) | ||||
| /** | /** | ||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False. | |||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *input: The input tensors. | |||||
| *input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li cond: A subgraph takes 'input' and returns a tensor. \n | |||||
| * If the tensor is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if it is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A subgraph takes 'input' and returns a tensor. | |||||
| * If the tensor is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; | |||||
| * if it is a string scalar, non-empty means True and empty means False; | |||||
| * if it is not a scalar, non-empty means True and empty means False. | * if it is not a scalar, non-empty means True and empty means False. | ||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors. | |||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *parallel_iterations: An optional int, default as 10. | |||||
| *parallel_iterations: An optional int, default as 10 . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "body". Has the same type as "input". | |||||
| *output: The output tensors returned by "body". Has the same type as "input" . \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator _While. | *@Compatible with the TensorFlow operator _While. | ||||
| @@ -193,25 +180,25 @@ REG_OP(_While) | |||||
| .OP_END_FACTORY_REG(_While) | .OP_END_FACTORY_REG(_While) | ||||
| /** | /** | ||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False. | |||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *input: The input tensors. | |||||
| *input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li cond: A subgraph takes 'input' and returns a tensor. \n | |||||
| * If the tensor is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if it is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A subgraph takes 'input' and returns a tensor. | |||||
| * If the tensor is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; | |||||
| * if it is a string scalar, non-empty means True and empty means False; | |||||
| * if it is not a scalar, non-empty means True and empty means False. | * if it is not a scalar, non-empty means True and empty means False. | ||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors. | |||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *parallel_iterations: An optional int, default as 10. | |||||
| *parallel_iterations: An optional int, default as 10 . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "body". Has the same type as "input". | |||||
| *output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator While. | *@Compatible with the TensorFlow operator While. | ||||
| @@ -225,25 +212,25 @@ REG_OP(While) | |||||
| .OP_END_FACTORY_REG(While) | .OP_END_FACTORY_REG(While) | ||||
| /** | /** | ||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False. | |||||
| *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *input: The input tensors. | |||||
| *input: The input tensors . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *@li cond: A subgraph takes 'input' and returns a tensor. \n | |||||
| * If the tensor is not a scalar of boolean type, \n | |||||
| * it will be converted to a boolean according to the following rule: \n | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; \n | |||||
| * if it is a string scalar, non-empty means True and empty means False; \n | |||||
| *@li cond: A subgraph takes 'input' and returns a tensor. | |||||
| * If the tensor is not a scalar of boolean type, | |||||
| * it will be converted to a boolean according to the following rule: | |||||
| * if it is a numerical scalar, non-zero means True and zero means False; | |||||
| * if it is a string scalar, non-empty means True and empty means False; | |||||
| * if it is not a scalar, non-empty means True and empty means False. | * if it is not a scalar, non-empty means True and empty means False. | ||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors. | |||||
| *@li body: A subgraph takes 'input' and returns a another list of tensors . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *parallel_iterations: An optional int, default as 10. | |||||
| *parallel_iterations: An optional int, default as 10 . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "body". Has the same type as "input". | |||||
| *output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator StatelessWhile. | *@Compatible with the TensorFlow operator StatelessWhile. | ||||
| @@ -257,19 +244,19 @@ REG_OP(StatelessWhile) | |||||
| .OP_END_FACTORY_REG(StatelessWhile) | .OP_END_FACTORY_REG(StatelessWhile) | ||||
| /** | /** | ||||
| *@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound. | |||||
| *@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *@li start: A int32 scalar. The lower bound. | *@li start: A int32 scalar. The lower bound. | ||||
| *@li limit: A int32 scalar. The upper bound. | *@li limit: A int32 scalar. The upper bound. | ||||
| *@li delta: A int32 scalar. The step size. | *@li delta: A int32 scalar. The step size. | ||||
| *@li input: The input tensors, which will be passed to "body". | |||||
| *@li input: The input tensors, which will be passed to "body" . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *body: A subgraph takes 'input' and returns a another list of tensors. | |||||
| *body: A subgraph takes 'input' and returns a another list of tensors . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "body". Has the same type as "input". | |||||
| *output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator For. | *@Compatible with the TensorFlow operator For. | ||||
| @@ -284,21 +271,21 @@ REG_OP(For) | |||||
| .OP_END_FACTORY_REG(For) | .OP_END_FACTORY_REG(For) | ||||
| /** | /** | ||||
| *@brief Pass the input tensors to the subgraph "f" and return the output tensors. | |||||
| *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *args: The input tensors, which will be passed to "f". | |||||
| *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *f: A subgraph takes 'args' and returns a another list of tensors. | |||||
| *f: A subgraph takes 'args' and returns a another list of tensors . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li config: An optional string, default as "". | *@li config: An optional string, default as "". | ||||
| *@li config_proto: An optional int, default as "". | *@li config_proto: An optional int, default as "". | ||||
| *@li executor_type: An optional int, default as "". | |||||
| *@li executor_type: An optional int, default as "" . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "f". | |||||
| *output: The output tensors returned by "f" . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator PartitionedCall. | *@Compatible with the TensorFlow operator PartitionedCall. | ||||
| @@ -313,21 +300,21 @@ REG_OP(PartitionedCall) | |||||
| .OP_END_FACTORY_REG(PartitionedCall) | .OP_END_FACTORY_REG(PartitionedCall) | ||||
| /** | /** | ||||
| *@brief Pass the input tensors to the subgraph "f" and return the output tensors. | |||||
| *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n | |||||
| *@par Inputs: | *@par Inputs: | ||||
| *args: The input tensors, which will be passed to "f". | |||||
| *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n | |||||
| *@par Graphs: | *@par Graphs: | ||||
| *f: A subgraph takes 'args' and returns a another list of tensors. | |||||
| *f: A subgraph takes 'args' and returns a another list of tensors . \n | |||||
| *@par Attributes: | *@par Attributes: | ||||
| *@li config: An optional string, default as "". | *@li config: An optional string, default as "". | ||||
| *@li config_proto: An optional int, default as "". | *@li config_proto: An optional int, default as "". | ||||
| *@li executor_type: An optional int, default as "". | |||||
| *@li executor_type: An optional int, default as "" . \n | |||||
| *@par Outputs: | *@par Outputs: | ||||
| *output: The output tensors returned by "f". | |||||
| *output: The output tensors returned by "f" . It's a dynamic output. \n | |||||
| *@par Third-party framework compatibility | *@par Third-party framework compatibility | ||||
| *@Compatible with the TensorFlow operator StatefulPartitionedCall. | *@Compatible with the TensorFlow operator StatefulPartitionedCall. | ||||
| @@ -341,11 +328,6 @@ REG_OP(StatefulPartitionedCall) | |||||
| .ATTR(executor_type, String, "") | .ATTR(executor_type, String, "") | ||||
| .OP_END_FACTORY_REG(StatefulPartitionedCall) | .OP_END_FACTORY_REG(StatefulPartitionedCall) | ||||
| REG_OP(FakeParam) | |||||
| .OUTPUT(output, TensorType::ALL()) | |||||
| .ATTR(shape, ListInt, {}) | |||||
| .OP_END_FACTORY_REG(FakeParam) | |||||
| } // namespace ge | } // namespace ge | ||||
| #endif // GE_FUNCTIONAL_OPS_H_ | #endif // GE_FUNCTIONAL_OPS_H_ | ||||
| @@ -27,18 +27,18 @@ namespace ge { | |||||
| /** | /** | ||||
| * @brief Outputs a tensor gathering all input tensors. | * @brief Outputs a tensor gathering all input tensors. | ||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| * float32. | |||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| float32. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| * participating in the op. | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| * participating in the op. | |||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| participating in the op. | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| participating in the op. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A Tensor. Has the same type as "x". | * y: A Tensor. Has the same type as "x". | ||||
| * @attention Constraints:\n | |||||
| * "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| * as the name of a world group. | |||||
| * @attention Constraints: | |||||
| "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| as the name of a world group. | |||||
| */ | */ | ||||
| REG_OP(HcomAllGather) | REG_OP(HcomAllGather) | ||||
| .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | ||||
| @@ -50,25 +50,25 @@ REG_OP(HcomAllGather) | |||||
| .OP_END_FACTORY_REG(HcomAllGather) | .OP_END_FACTORY_REG(HcomAllGather) | ||||
| /** | /** | ||||
| * @brief Outputs a tensor containing the reduction across all input tensors | |||||
| * passed to op. | |||||
| * @brief Outputs a tensor containing the reduction across all input tensors | |||||
| passed to op. | |||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| * float32. | |||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| float32. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li reduction: A required string identifying the reduction operation to | |||||
| * perform.The supported operation are: "sum", "max", "min", "prod". | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| * participating in the op. | |||||
| * @li fusion: An optional integer identifying the fusion flag of the op. \n | |||||
| * 0: no fusion; 1 (default): fusion; 2: fusion the ops by fusion id. | |||||
| * @li reduction: A required string identifying the reduction operation to | |||||
| perform.The supported operation are: "sum", "max", "min", "prod". | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| participating in the op. | |||||
| * @li fusion: An optional integer identifying the fusion flag of the op. | |||||
| 0: no fusion; 1 (default): fusion; 2: fusion the ops by fusion id. | |||||
| * @li fusion_id: An optional integer identifying the fusion id of the op. | * @li fusion_id: An optional integer identifying the fusion id of the op. | ||||
| * The HcomAllReduce ops with the same fusion id will be fused. | * The HcomAllReduce ops with the same fusion id will be fused. | ||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A Tensor. Has the same type as "x". | * y: A Tensor. Has the same type as "x". | ||||
| * @attention Constraints: \n | |||||
| * "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| * as the name of a world group. | |||||
| * @attention Constraints: | |||||
| *"group" is limited to 128 characters. Use "hccl_world_group" | |||||
| as the name of a world group. | |||||
| */ | */ | ||||
| REG_OP(HcomAllReduce) | REG_OP(HcomAllReduce) | ||||
| .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | ||||
| @@ -84,18 +84,19 @@ REG_OP(HcomAllReduce) | |||||
| /** | /** | ||||
| * @brief Broadcasts the input tensor in root rank to all ranks. | * @brief Broadcasts the input tensor in root rank to all ranks. | ||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A list of dynamic input tensor. Must be one of the following types: | |||||
| * int8, int16, int32, float16, float32. | |||||
| * x: A list of dynamic input tensor. Must be one of the following types: | |||||
| int8, int16, int32, float16, float32. It's a dynamic input. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li root_rank: A required integer identifying the root rank in the op | |||||
| * input of this rank will be broadcast to other ranks. | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| * participating in the op. | |||||
| * @li root_rank: A required integer identifying the root rank in the op | |||||
| input of this rank will be broadcast to other ranks. | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| participating in the op. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A list of dynamic output tensor. Has the same type and length as "x". | * y: A list of dynamic output tensor. Has the same type and length as "x". | ||||
| * @attention Constraints:\n | |||||
| * "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| * as the name of a world group. | |||||
| * It's a dynamic output. | |||||
| * @attention Constraints: | |||||
| "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| as the name of a world group. | |||||
| */ | */ | ||||
| REG_OP(HcomBroadcast) | REG_OP(HcomBroadcast) | ||||
| .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | ||||
| @@ -107,24 +108,24 @@ REG_OP(HcomBroadcast) | |||||
| .OP_END_FACTORY_REG(HcomBroadcast) | .OP_END_FACTORY_REG(HcomBroadcast) | ||||
| /** | /** | ||||
| * @brief Performs reduction across all input tensors, scattering in equal | |||||
| * blocks among ranks, each rank getting a chunk of data based on its rank | |||||
| * index. | |||||
| * @brief Performs reduction across all input tensors, scattering in equal | |||||
| blocks among ranks, each rank getting a chunk of data based on its rank | |||||
| index. | |||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| * float32. | |||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| float32. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li reduction: A required string identifying the reduction operation to | |||||
| * perform. The supported operation are: "sum", "max", "min", "prod". | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| * participating in the op. | |||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| * participating in the op. | |||||
| * @li reduction: A required string identifying the reduction operation to | |||||
| perform. The supported operation are: "sum", "max", "min", "prod". | |||||
| * @li group: A required string identifying the group name of ranks | |||||
| participating in the op. | |||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| participating in the op. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A Tensor. Has the same type as "x". | * y: A Tensor. Has the same type as "x". | ||||
| * @attention Constraints:\n | |||||
| * "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| * as the name of a world group. | |||||
| * @attention Constraints: | |||||
| "group" is limited to 128 characters. Use "hccl_world_group" | |||||
| as the name of a world group. | |||||
| */ | */ | ||||
| REG_OP(HcomReduceScatter) | REG_OP(HcomReduceScatter) | ||||
| .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) | ||||
| @@ -139,19 +140,19 @@ REG_OP(HcomReduceScatter) | |||||
| /** | /** | ||||
| * @brief Sends the input tensor to destination rank. | * @brief Sends the input tensor to destination rank. | ||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| * float32. | |||||
| * x: A tensor. Must be one of the following types: int8, int16, int32, float16, | |||||
| float32. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li sr_tag: A required integer identifying the send/recv message tag. The | |||||
| * message will be received by the HcomReceive op with the same "sr_tag". | |||||
| * @li sr_tag: A required integer identifying the send/recv message tag. The | |||||
| message will be received by the HcomReceive op with the same "sr_tag". | |||||
| * @li dest_rank: A required integer identifying the destination rank. | * @li dest_rank: A required integer identifying the destination rank. | ||||
| * @li group: A string identifying the group name of ranks participating in | |||||
| * the op. | |||||
| * @li group: A string identifying the group name of ranks participating in | |||||
| the op. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * None. | * None. | ||||
| * @attention Constraints:\n | |||||
| * @li "group" is limited to 128 characters. Use | |||||
| * "hccl_world_group" as the name of a world group. | |||||
| * @attention Constraints: | |||||
| @li "group" is limited to 128 characters. Use | |||||
| "hccl_world_group" as the name of a world group. | |||||
| * @li Operators HcomSend and HcomReceive have the same "sr_tag". | * @li Operators HcomSend and HcomReceive have the same "sr_tag". | ||||
| * @see HcomReceive | * @see HcomReceive | ||||
| */ | */ | ||||
| @@ -169,20 +170,20 @@ REG_OP(HcomSend) | |||||
| * @par Inputs: | * @par Inputs: | ||||
| * None. | * None. | ||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li sr_tag: A required integer identifying the send/recv message tag. The | |||||
| * message will be send by the HcomSend op with the same "sr_tag". | |||||
| * @li sr_tag: A required integer identifying the send/recv message tag. The | |||||
| message will be send by the HcomSend op with the same "sr_tag". | |||||
| * @li src_rank: A required integer identifying the source rank. | * @li src_rank: A required integer identifying the source rank. | ||||
| * @li group: A required string identifying the group name of ranks | * @li group: A required string identifying the group name of ranks | ||||
| * participating in the op. | * participating in the op. | ||||
| * @li shape: A required list identifying the shape of the tensor to be | |||||
| * received. | |||||
| * @li dtype: A required integer identifying the type of the tensor to be | |||||
| * received. The supported types are: int8, int16, int32, float16, float32. | |||||
| * @li shape: A required list identifying the shape of the tensor to be | |||||
| received. | |||||
| * @li dtype: A required integer identifying the type of the tensor to be | |||||
| received. The supported types are: int8, int16, int32, float16, float32. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A tensor with type identified in "dtype". | * y: A tensor with type identified in "dtype". | ||||
| * @attention Constraints:\n | |||||
| * @li "group" is limited to 128 characters. Use | |||||
| * "hccl_world_group" as the name of a world group. | |||||
| * @attention Constraints: | |||||
| @li "group" is limited to 128 characters. Use | |||||
| "hccl_world_group" as the name of a world group. | |||||
| * @li Operators HcomSend and HcomReceive have the same "sr_tag". | * @li Operators HcomSend and HcomReceive have the same "sr_tag". | ||||
| * @li "shape" should be same as the input tensor of HcomSend. | * @li "shape" should be same as the input tensor of HcomSend. | ||||
| * @li "dtype" should be same as the input tensor of HcomSend. | * @li "dtype" should be same as the input tensor of HcomSend. | ||||
| @@ -28,10 +28,10 @@ namespace ge { | |||||
| * @brief Outputs a tensor gathering all input tensors. | * @brief Outputs a tensor gathering all input tensors. | ||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: uint8, int8, uint16, int16, int32, | * x: A tensor. Must be one of the following types: uint8, int8, uint16, int16, int32, | ||||
| * int64, float16, bool. | |||||
| int64, float16, bool. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| * participating in the op. | |||||
| * @li rank_size: A required integer identifying the number of ranks | |||||
| participating in the op. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A Tensor. Has the same type as "x". | * y: A Tensor. Has the same type as "x". | ||||
| */ | */ | ||||
| @@ -44,13 +44,13 @@ REG_OP(HorovodAllgather) | |||||
| .OP_END_FACTORY_REG(HorovodAllgather) | .OP_END_FACTORY_REG(HorovodAllgather) | ||||
| /** | /** | ||||
| * @brief Outputs a tensor containing the reduction across all input tensors | |||||
| * passed to op. | |||||
| * @brief Outputs a tensor containing the reduction across all input tensors | |||||
| passed to op. | |||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A tensor. Must be one of the following types: int32, int64, float16, float32 | |||||
| * @par Attributes: | |||||
| * @li reduce_op: A required int identifying the reduction operation to | |||||
| * perform.The supported operation are: "sum", "max", "min", "prod". | |||||
| * x: A tensor. Must be one of the following types: int32, int64, float16, float32 | |||||
| @par Attributes: | |||||
| * @li reduce_op: A required int identifying the reduction operation to | |||||
| perform.The supported operation are: "sum", "max", "min", "prod". | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A Tensor. Has the same type as "x". | * y: A Tensor. Has the same type as "x". | ||||
| */ | */ | ||||
| @@ -63,11 +63,11 @@ REG_OP(HorovodAllreduce) | |||||
| /** | /** | ||||
| * @brief Broadcasts the input tensor in root rank to all ranks. | * @brief Broadcasts the input tensor in root rank to all ranks. | ||||
| * @par Inputs: | * @par Inputs: | ||||
| * x: A list of dynamic input tensor. Must be one of the following types: | |||||
| * int8, int32, float16, float32. | |||||
| * x: A list of dynamic input tensor. Must be one of the following types: | |||||
| int8, int32, float16, float32. | |||||
| * @par Attributes: | * @par Attributes: | ||||
| * @li root_rank: A required integer identifying the root rank in the op | |||||
| * input of this rank will be broadcast to other ranks. | |||||
| * @li root_rank: A required integer identifying the root rank in the op | |||||
| input of this rank will be broadcast to other ranks. | |||||
| * @par Outputs: | * @par Outputs: | ||||
| * y: A list of dynamic output tensor. Has the same type and length as "x". | * y: A list of dynamic output tensor. Has the same type and length as "x". | ||||
| */ | */ | ||||