diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index 033c5304..66eeeba8 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -18,6 +18,7 @@ #include #include "framework/common/debug/log.h" #include "graph/compute_graph.h" +#include "graph/utils/tensor_utils.h" #include "hybrid_execution_context.h" #include "subgraph_context.h" @@ -35,29 +36,31 @@ ShapeInferenceState::ShapeInferenceState(const NodeItem &node_item) : node_item( this->num_pending_shapes_); } -Status ShapeInferenceState::UpdateInputShape(int idx, - const GeShape &ori_shape, - const GeShape &shape) { +Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target) { if (node_item.IsInputShapeStatic(idx)) { GELOGD("[%s] Trying to update static shape, idx = %d. old shape = [%s], new shape = [%s]", node_item.NodeName().c_str(), idx, node_item.MutableInputDesc(idx)->GetShape().ToString().c_str(), - shape.ToString().c_str()); + target.GetShape().ToString().c_str()); return SUCCESS; } - GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s]", + int64_t tensor_size = -1; + (void) TensorUtils::GetSize(target, tensor_size); + GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s], size = %ld", node_item.NodeName().c_str(), idx, - shape.ToString().c_str(), - ori_shape.ToString().c_str()); + target.GetShape().ToString().c_str(), + target.GetOriginShape().ToString().c_str(), + tensor_size); std::lock_guard lk(mu_); auto tensor_desc = node_item.MutableInputDesc(idx); GE_CHECK_NOTNULL(tensor_desc); - tensor_desc->SetShape(shape); - tensor_desc->SetOriginShape(ori_shape); + tensor_desc->SetShape(target.GetShape()); + tensor_desc->SetOriginShape(target.GetOriginShape()); + (void) TensorUtils::SetSize(*tensor_desc, tensor_size); if (--num_pending_shapes_ == 0) { ready_cv_.notify_all(); } @@ -110,24 +113,24 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex 
for (auto &p : shape_futures) { auto idx = p.first; auto &future = p.second; - GeShape shape; - GeShape ori_shape; RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] Start", idx); - GE_CHK_STATUS_RET(future.Get(ori_shape, shape), - "[%s] Get shape failed. index = %u", - node_item.NodeName().c_str(), - idx); + auto src_tensor_desc = future.GetTensorDesc(); + GE_CHECK_NOTNULL(src_tensor_desc); RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] End", idx); + auto input_desc = node_item.MutableInputDesc(idx); + GE_CHECK_NOTNULL(input_desc); + int64_t tensor_size = -1; + (void) TensorUtils::GetSize(*src_tensor_desc, tensor_size); GELOGD("[%s] Update input shape [%u] with shape: [%s] and ori_shape: [%s]", node_item.NodeName().c_str(), idx, - shape.ToString().c_str(), - ori_shape.ToString().c_str()); - auto input_desc = node_item.MutableInputDesc(idx); - GE_CHECK_NOTNULL(input_desc); - input_desc->SetShape(std::move(shape)); - input_desc->SetOriginShape(ori_shape); + src_tensor_desc->GetShape().ToString().c_str(), + src_tensor_desc->GetOriginShape().ToString().c_str(), + tensor_size); + input_desc->SetShape(src_tensor_desc->GetShape()); + input_desc->SetOriginShape(src_tensor_desc->GetOriginShape()); + (void) TensorUtils::SetSize(*input_desc, tensor_size); } return SUCCESS; @@ -190,5 +193,14 @@ Status ShapeFuture::Get(GeShape &ori_shape, GeShape &shape) { GELOGD("Get shape from %s:%u. 
shape = [%s]", src_node_->GetName().c_str(), src_index_, shape.ToString().c_str()); return SUCCESS; } + +GeTensorDescPtr ShapeFuture::GetTensorDesc() { + GELOGD("Start to wait node: %s for getting shape", src_node_->GetName().c_str()); + if (!subgraph_context_->Await(src_node_)) { + GELOGE(INTERNAL_ERROR, "cancelled"); + return nullptr; + } + return src_node_->GetOpDesc()->MutableOutputDesc(src_index_); +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/executor/node_state.h b/ge/hybrid/executor/node_state.h index 04f1ee4b..312e177f 100644 --- a/ge/hybrid/executor/node_state.h +++ b/ge/hybrid/executor/node_state.h @@ -35,6 +35,7 @@ class ShapeFuture { ShapeFuture(NodePtr src_node, uint32_t src_index, SubgraphContext *subgraph_context); ~ShapeFuture() = default; Status Get(GeShape &ori_shape, GeShape &shape); + GeTensorDescPtr GetTensorDesc(); private: NodePtr src_node_; @@ -45,7 +46,7 @@ class ShapeFuture { struct ShapeInferenceState { explicit ShapeInferenceState(const NodeItem &node_item); - Status UpdateInputShape(int idx, const GeShape &ori_shape, const GeShape &shape); + Status UpdateInputShape(int idx, const GeTensorDesc &tensor_desc); void UpdateInputShapeFuture(int idx, ShapeFuture &&future); diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 5a464f8e..4b6dddab 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -96,7 +96,7 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetOrCreateNodeState(input_node); GE_CHECK_NOTNULL(node_state); - node_state->GetShapeInferenceState().UpdateInputShape(0, tensor_desc->GetOriginShape(), tensor_desc->GetShape()); + node_state->GetShapeInferenceState().UpdateInputShape(0, *tensor_desc); } } @@ -268,13 +268,6 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta } else { node_state.SetKernelTask(node_item.kernel_task); } - - GELOGD("[%s] Start to invoke 
CalcOpRunningParam.", node_item.NodeName().c_str()); - RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start"); - GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().CalcOpRunningParam(*node_item.node), - "[%s] Failed to invoke CalcOpRunningParam.", node_item.NodeName().c_str()); - RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] End"); - GELOGD("[%s] Done invoking CalcOpRunningParam successfully.", node_item.NodeName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index e6729352..0d9c7a69 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -20,12 +20,9 @@ #include "graph/utils/tensor_adapter.h" #include "graph/debug/ge_attr_define.h" #include "hybrid/node_executor/node_executor.h" -#include "common/dump/dump_manager.h" +#include "hybrid/executor//worker//shape_inference_engine.h" #include "common/dump/dump_op.h" -#include "common/types.h" -#include "common/ge_types.h" #include "common/profiling/profiling_manager.h" -#include "runtime/base.h" namespace ge { namespace hybrid { @@ -349,6 +346,10 @@ Status NodeDoneCallback::OnNodeDone() { } GE_CHK_STATUS_RET_NOLOG(PrepareConstInputs(node_item)); + if (node_item.shape_inference_type == DEPEND_SHAPE_RANGE || node_item.shape_inference_type == DEPEND_COMPUTE) { + // update output tensor sizes + GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(node_item)); + } // PropagateOutputs for type == DEPEND_COMPUTE if (node_item.shape_inference_type == DEPEND_COMPUTE) { if (graph_context_->trace_enabled) { diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index 1d813526..02b3a50b 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -17,9 +17,15 @@ #include 
"hybrid/executor/worker/shape_inference_engine.h" #include "graph/shape_refiner.h" #include "graph/utils/node_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" +#include "common/math/math_util.h" #include "hybrid/node_executor/node_executor.h" namespace ge { +namespace { +const int kAlignment = 32; +} namespace hybrid { ShapeInferenceEngine::ShapeInferenceEngine(GraphExecutionContext *execution_context, SubgraphContext *subgraph_context) : execution_context_(execution_context), @@ -40,7 +46,9 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { } if (node_item.fused_subgraph != nullptr) { - return InferShapeForSubgraph(node_item, *node_item.fused_subgraph); + GE_CHK_STATUS_RET_NOLOG(InferShapeForSubgraph(node_item, *node_item.fused_subgraph)); + GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item)); + return SUCCESS; } // Skip shape inference for node of type DEPEND_COMPUTE @@ -66,18 +74,12 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { "Invoke InferShapeAndType failed."); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } - // Check again to make sure shape is valid after shape inference - if (node_item.shape_inference_type != DEPEND_SHAPE_RANGE) { - bool is_unknown_shape = false; - GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node_item.node, is_unknown_shape), - "Failed to get shape status. 
node = %s", - node_item.NodeName().c_str()); - - GE_CHK_BOOL_RET_STATUS(!is_unknown_shape, - INTERNAL_ERROR, - "[%s] Shape is still unknown after shape inference.", - node_item.NodeName().c_str()); - } + + // update output tensor sizes after shape inference + // error if shape is still unknown and not of type DEPEND_SHAPE_RANGE + RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start"); + GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item, node_item.shape_inference_type == DEPEND_SHAPE_RANGE)); + RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] End"); GELOGD("[%s] [HybridTrace] After shape inference. Node = %s", node_item.NodeName().c_str(), @@ -127,8 +129,6 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) { // propagate each output for (int i = 0; i < node_item.num_outputs; ++i) { auto output_desc = node_item.op_desc->MutableOutputDesc(i); - const auto &shape = output_desc->MutableShape(); - const auto &ori_shape = output_desc->GetOriginShape(); auto &output_nodes = node_item.outputs[i]; // propagate output to all sub-inputs @@ -149,9 +149,7 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) { infer_state.UpdateInputShapeFuture(dst_input_index_and_node.first, std::move(future)); } else { - GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first, - ori_shape, - shape)); + GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first, *output_desc)); } } } @@ -230,5 +228,71 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { } return SUCCESS; } + +Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range) { + auto op_desc = node_item.GetOpDesc(); + for (size_t output_index = 0; output_index < op_desc->GetOutputsSize(); ++output_index) { + auto tensor_desc = op_desc->MutableOutputDesc(output_index); + 
GE_CHECK_NOTNULL(tensor_desc); + const auto &shape = tensor_desc->MutableShape(); + auto dims = shape.GetDims(); + auto dim_num = dims.size(); + if (shape.IsUnknownShape()) { + if (!fallback_with_range) { + GELOGE(INTERNAL_ERROR, "[%s] Shape of output[%zu] is still unknown after shape inference. shape = [%s]", + node_item.NodeName().c_str(), + output_index, + shape.ToString().c_str()); + return INTERNAL_ERROR; + } + + GELOGD("[%s] Calc output[%zu] size by range", node_item.NodeName().c_str(), output_index); + std::vector> shape_range; + GE_CHK_GRAPH_STATUS_RET(tensor_desc->GetShapeRange(shape_range), + "[%s] Failed to get shape range for output: %zu", + node_item.NodeName().c_str(), + output_index); + if (shape_range.size() != dim_num) { + GELOGE(INTERNAL_ERROR, "[%s] Number of shape ranges (%zu) mismatches that of dims (%zu), index = %zu", + node_item.NodeName().c_str(), + shape_range.size(), + dim_num, + output_index); + return INTERNAL_ERROR; + } + + for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { + if (dims[dim_index] == ge::UNKNOWN_DIM) { + dims[dim_index] = shape_range[dim_index].second; + } + } + } + + uint32_t type_size = 0; + if (!TypeUtils::GetDataTypeLength(tensor_desc->GetDataType(), type_size)) { + GELOGE(INTERNAL_ERROR, "Failed to get data type size"); + return INTERNAL_ERROR; + } + int64_t tensor_size = type_size; + for (const auto &dim : dims) { + GE_CHECK_GE(dim, 0); + GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim), + "[%s] Shape size overflow, shape = [%s]", + node_item.NodeName().c_str(), + shape.ToString().c_str()); + tensor_size *= dim; + } + + GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1), + "[%s] Output[%zu] Tensor size too large, shape = [%s]", + node_item.NodeName().c_str(), + output_index, + shape.ToString().c_str()); + tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment; + (void) TensorUtils::SetSize(*tensor_desc, tensor_size); + } + + return SUCCESS; +} } // namespace 
hybrid } // namespace ge diff --git a/ge/hybrid/executor/worker/shape_inference_engine.h b/ge/hybrid/executor/worker/shape_inference_engine.h index 7bb9269c..9401ead2 100644 --- a/ge/hybrid/executor/worker/shape_inference_engine.h +++ b/ge/hybrid/executor/worker/shape_inference_engine.h @@ -34,6 +34,8 @@ class ShapeInferenceEngine { Status PropagateOutputShapes(const NodeItem &node_item); + static Status CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range = false); + private: static Status UpdatePeerNodeShape(const Node &node); Status AwaitDependentNodes(NodeState &node_state); diff --git a/ge/hybrid/model/node_item.cc b/ge/hybrid/model/node_item.cc index 69cf334d..1fd8fe31 100644 --- a/ge/hybrid/model/node_item.cc +++ b/ge/hybrid/model/node_item.cc @@ -22,6 +22,7 @@ #include "graph/debug/ge_attr_define.h" #include "graph/utils/node_utils.h" #include "hybrid/node_executor/node_executor.h" +#include "hybrid/executor/worker/shape_inference_engine.h" namespace ge { namespace hybrid { @@ -47,7 +48,7 @@ Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgr GE_CHECK_NOTNULL(dst_op_desc); auto in_idx = node_and_anchor.second->GetIdx(); auto tensor_desc = dst_op_desc->MutableInputDesc(in_idx); - fused_subgraph.input_mapping[parent_index].emplace_back(tensor_desc); + fused_subgraph.input_mapping[static_cast(parent_index)].emplace_back(tensor_desc); GELOGD("Input[%u] mapped to [%s:%u]", parent_index, dst_op_desc->GetName().c_str(), in_idx); } @@ -64,7 +65,7 @@ Status ParseOutputMapping(const OpDescPtr &op_desc, FusedSubgraph &fused_subgrap return FAILED; } - fused_subgraph.output_mapping.emplace(parent_index, op_desc); + fused_subgraph.output_mapping.emplace(static_cast(parent_index), op_desc); return SUCCESS; } @@ -175,6 +176,10 @@ Status NodeItem::Init() { } } + if (is_output_shape_static) { + GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(*this)); + } + if (IsControlOp() || node_type == PARTITIONEDCALL) 
{ shape_inference_type = DEPEND_COMPUTE; } else { diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index 77004f99..d6291c60 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ b/ge/hybrid/node_executor/task_context.cc @@ -148,6 +148,10 @@ Status TaskContext::AllocateWorkspaces() { } Status TaskContext::RegisterCallback(const std::function &callback_fun) const { + if (callback_fun == nullptr) { + GELOGW("[%s] Callback is NULL", GetNodeName()); + return SUCCESS; + } auto ret = execution_context_->callback_manager->RegisterCallback(callback_fun); if (ret != SUCCESS) { GELOGE(ret, "[%s] Failed to register callback", GetNodeName()); @@ -384,6 +388,20 @@ const char *TaskContext::GetNodeName() const { return node_item_->NodeName().c_str(); } +void TaskContext::ReleaseInputsAndOutputs() { + for (int i = 0; i < node_item_->num_inputs; ++i) { + auto tensor = inputs_start_ + i; + tensor->Destroy(); + GELOGD("[%s] Tensor of input[%d] released", GetNodeName(), i); + } + + for (int i = 0; i < node_item_->num_outputs; ++i) { + auto tensor = outputs_start_ + i; + tensor->Destroy(); + GELOGD("[%s] Tensor of output[%d] released", GetNodeName(), i); + } +} + void TaskContext::ReleaseInput(int index) { auto input_tensor = MutableInput(index); if (input_tensor != nullptr) { @@ -456,5 +474,9 @@ Status TaskContext::TryExecuteCallback(const function &callback_fun) con const DumpProperties &TaskContext::GetDumpProperties() const { return execution_context_->dump_properties; } + +bool TaskContext::NeedCallback() { + return node_item_->has_observer || IsDumpEnabled() || execution_context_->profiling_level > 0; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index 0549a1dc..34754a14 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -50,6 +50,8 @@ class TaskContext { ConstGeTensorDescPtr 
GetOutputDesc(int index) const; GeTensorDescPtr MutableInputDesc(int index) const; GeTensorDescPtr MutableOutputDesc(int index) const; + void ReleaseInputsAndOutputs(); + bool NeedCallback(); void ReleaseInput(int index); const TensorValue *GetInput(int index) const; const TensorValue *GetOutput(int index) const;