Merge remote-tracking branch 'upstream/master'

Tag: v1.1.0
Author: taoxiangdong (4 years ago)
Commit: dc72f5dcae
100 changed files with 4267 additions and 2629 deletions
  1. RELEASE.md  (+19 -0)
  2. ge/CMakeLists.txt  (+3 -0)
  3. ge/analyzer/analyzer.cc  (+19 -8)
  4. ge/analyzer/analyzer.h  (+2 -0)
  5. ge/client/CMakeLists.txt  (+2 -0)
  6. ge/client/ge_api.cc  (+1 -1)
  7. ge/client/module.mk  (+4 -0)
  8. ge/common/CMakeLists.txt  (+1 -0)
  9. ge/common/dump/dump_manager.cc  (+3 -8)
  10. ge/common/dump/dump_manager.h  (+0 -1)
  11. ge/common/dump/dump_op.cc  (+1 -1)
  12. ge/common/dump/dump_properties.cc  (+34 -3)
  13. ge/common/dump/dump_properties.h  (+18 -0)
  14. ge/common/ge/op_tiling_manager.cc  (+6 -3)
  15. ge/common/ge_common.mk  (+1 -0)
  16. ge/common/helper/model_cache_helper.cc  (+19 -1)
  17. ge/common/helper/model_helper.cc  (+17 -2)
  18. ge/common/profiling/profiling_manager.cc  (+15 -2)
  19. ge/common/profiling/profiling_manager.h  (+1 -1)
  20. ge/common/properties_manager.cc  (+6 -0)
  21. ge/common/properties_manager.h  (+5 -1)
  22. ge/common/util.cc  (+68 -2)
  23. ge/executor/CMakeLists.txt  (+3 -1)
  24. ge/executor/ge_executor.cc  (+48 -1)
  25. ge/executor/module.mk  (+3 -0)
  26. ge/ge_inference.mk  (+5 -0)
  27. ge/ge_local_engine/engine/host_cpu_engine.cc  (+57 -32)
  28. ge/ge_runner.mk  (+9 -0)
  29. ge/graph/build/memory/block_mem_assigner.cc  (+5 -0)
  30. ge/graph/build/memory/graph_mem_assigner.cc  (+50 -3)
  31. ge/graph/build/memory/graph_mem_assigner.h  (+2 -0)
  32. ge/graph/build/task_generator.cc  (+56 -20)
  33. ge/graph/build/task_generator.h  (+2 -1)
  34. ge/graph/load/new_model_manager/davinci_model.cc  (+15 -2)
  35. ge/graph/load/new_model_manager/davinci_model.h  (+1 -0)
  36. ge/graph/load/new_model_manager/model_manager.cc  (+82 -3)
  37. ge/graph/load/new_model_manager/model_manager.h  (+6 -0)
  38. ge/graph/load/new_model_manager/model_utils.cc  (+29 -12)
  39. ge/graph/load/new_model_manager/task_info/kernel_task_info.cc  (+2 -94)
  40. ge/graph/load/new_model_manager/task_info/kernel_task_info.h  (+0 -2)
  41. ge/graph/load/new_model_manager/zero_copy_task.cc  (+4 -6)
  42. ge/graph/load/new_model_manager/zero_copy_task.h  (+5 -1)
  43. ge/graph/manager/graph_manager.cc  (+9 -1)
  44. ge/graph/partition/engine_place.cc  (+3 -0)
  45. ge/graph/passes/for_pass.cc  (+4 -0)
  46. ge/graph/passes/multi_batch_clone_pass.cc  (+1 -1)
  47. ge/graph/passes/reshape_recovery_pass.cc  (+3 -0)
  48. ge/graph/preprocess/multi_batch_copy_graph.cc  (+11 -7)
  49. ge/host_kernels/strided_slice_kernel.cc  (+238 -194)
  50. ge/host_kernels/strided_slice_kernel.h  (+18 -19)
  51. ge/hybrid/executor/hybrid_model_executor.cc  (+6 -0)
  52. ge/hybrid/executor/hybrid_model_executor.h  (+1 -1)
  53. ge/hybrid/model/hybrid_model_builder.cc  (+2 -1)
  54. ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc  (+2 -2)
  55. ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc  (+1 -0)
  56. ge/init/gelib.cc  (+13 -7)
  57. ge/init/gelib.h  (+1 -1)
  58. ge/session/inner_session.cc  (+7 -7)
  59. ge/session/inner_session.h  (+1 -0)
  60. ge/single_op/single_op.cc  (+38 -4)
  61. ge/single_op/single_op.h  (+10 -2)
  62. ge/single_op/single_op_model.cc  (+60 -10)
  63. ge/single_op/single_op_model.h  (+3 -1)
  64. ge/single_op/task/aicpu_kernel_task_builder.cc  (+23 -0)
  65. ge/single_op/task/aicpu_task_builder.cc  (+50 -20)
  66. ge/single_op/task/aicpu_task_builder.h  (+3 -1)
  67. ge/single_op/task/op_task.cc  (+408 -21)
  68. ge/single_op/task/op_task.h  (+68 -6)
  69. inc/external/ge/ge_prof.h  (+69 -0)
  70. inc/framework/common/ge_inner_error_codes.h  (+8 -3)
  71. inc/framework/common/util.h  (+18 -0)
  72. metadef/graph/CMakeLists.txt  (+7 -7)
  73. metadef/graph/compute_graph.cc  (+1 -1)
  74. metadef/graph/format_refiner.cc  (+6 -3)
  75. metadef/graph/ge_attr_define.cc  (+1 -0)
  76. metadef/graph/node.cc  (+34 -18)
  77. metadef/graph/op_desc.cc  (+1 -1)
  78. metadef/graph/operator.cc  (+14 -8)
  79. metadef/graph/shape_refiner.cc  (+7 -1)
  80. metadef/graph/utils/ge_ir_utils.cc  (+14 -3)
  81. metadef/graph/utils/graph_utils.cc  (+1 -2)
  82. metadef/graph/utils/op_desc_utils.cc  (+0 -1)
  83. metadef/inc/graph/debug/ge_attr_define.h  (+1 -0)
  84. src/ge/client/ge_prof.cc  (+375 -0)
  85. third_party/fwkacllib/inc/ops/aipp.h  (+10 -8)
  86. third_party/fwkacllib/inc/ops/array_ops.h  (+227 -210)
  87. third_party/fwkacllib/inc/ops/audio_ops.h  (+44 -44)
  88. third_party/fwkacllib/inc/ops/batch_ops.h  (+41 -40)
  89. third_party/fwkacllib/inc/ops/bitwise_ops.h  (+10 -10)
  90. third_party/fwkacllib/inc/ops/boosted_trees_ops.h  (+14 -14)
  91. third_party/fwkacllib/inc/ops/candidate_sampling_ops.h  (+154 -154)
  92. third_party/fwkacllib/inc/ops/condtake_ops.h  (+4 -4)
  93. third_party/fwkacllib/inc/ops/control_flow_ops.h  (+91 -91)
  94. third_party/fwkacllib/inc/ops/ctc_ops.h  (+31 -31)
  95. third_party/fwkacllib/inc/ops/data_flow_ops.h  (+553 -551)
  96. third_party/fwkacllib/inc/ops/elewise_calculation_ops.h  (+360 -350)
  97. third_party/fwkacllib/inc/ops/functional_ops.h  (+85 -103)
  98. third_party/fwkacllib/inc/ops/hcom_ops.h  (+64 -63)
  99. third_party/fwkacllib/inc/ops/hvd_ops.h  (+13 -13)
  100. third_party/fwkacllib/inc/ops/image_ops.h  (+396 -377)

RELEASE.md  (+19 -0)

@@ -1,3 +1,22 @@
+# Release 1.0.0
+
+## Major Features and Improvements
+* Automatically dump the input and output of the abnormal operator when the network execution is abnormal;
+* Realize dynamic multi-batch based on GotoLabel;
+* Optimize the performance of dynamic shape;
+* The dynamic resolution feature supports a new scene in which the network has multiple inputs and the shape of each input is different.
+
+## Bugfixes
+* Fixed the issue that the input and output data of the AICPU operator cannot be dumped in the single-operator execution scenario.
+* Fixed the execution failure in the custom AICPU operator cascading scenario.
+* Fixed the issue that in the dynamic batch + dynamic AIPP scenario, the getinputformat and getinputdims parameters are inconsistent.
+
+
+## Thanks to our Contributors
+Thanks goes to these wonderful people: wuweikang, wangcong, weiyang, yanghaorang, xutianchun, shibeiji, zhouchao, tanghuikang, zhoulili, liujunzhu, zhengyuanhua, taoxiangdong
+
+Contributions of any kind are welcome!
+
 # Release 0.7.0-beta
 
 ## Major Features and Improvements


ge/CMakeLists.txt  (+3 -0)

@@ -63,6 +63,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge)
 # need to remove dependencies on pb files later
 file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "analyzer/analyzer.cc"
+    "client/ge_prof.cc"
     "client/ge_api.cc"
     "common/dump/dump_manager.cc"
     "common/dump/dump_properties.cc"

@@ -230,6 +231,7 @@ target_link_libraries(ge_runner
     ${msprof}
     ${runtime}
     ${resouce}
+    ${ascend_hal}
     rt
     dl)

@@ -340,6 +342,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "host_kernels/unpack_kernel.cc"
     "host_kernels/unsqueeze_kernel.cc"
     "hybrid/hybrid_davinci_model_stub.cc"
+    "hybrid/node_executor/aicpu/aicpu_ext_info.cc"
     "init/gelib.cc"
     "ir_build/atc_ir_common.cc"
     "ir_build/ge_ir_build.cc"


ge/analyzer/analyzer.cc  (+19 -8)

@@ -101,7 +101,7 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) {
 ge::Status Analyzer::Initialize() {
   ClearHistoryFile();
-  return CreateAnalyzerFile();
+  return SUCCESS;
 }
 
 void Analyzer::Finalize() {

@@ -136,7 +136,7 @@ void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) {
   } else {
     auto iter1 = (iter->second).find(graph_id);
     if (iter1 == (iter->second).end()) {
-      GELOGW("can not find the graph json object by session_id[%lu] and graph_id[%lu].Do nothing", session_id,
+      GELOGW("Can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing.", session_id,
              graph_id);
     }
     (iter->second).erase(iter1);

@@ -169,6 +169,10 @@
 }
 
 ge::Status Analyzer::CreateAnalyzerFile() {
+  if (is_json_file_create_) {
+    GELOGD("analyzer file has been created!No necessary to create again!");
+    return SUCCESS;
+  }
   GELOGD("start to create analyzer file!");
   // Check whether the manifest exists, if not, create it.
   string real_path = RealPath(kFilePath.c_str());

@@ -176,18 +180,19 @@ ge::Status Analyzer::CreateAnalyzerFile() {
     GELOGE(FAILED, "File path is invalid.");
     return FAILED;
   }
-  string file = real_path + "/" + kAnalyzeFile;
-  GELOGD("Created analyzer file:[%s]", file.c_str());
-  int fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
+  std::lock_guard<std::mutex> lg(file_mutex_);
+  json_file_name_ = real_path + "/" + kAnalyzeFile;
+  GELOGD("Created analyzer file:[%s]", json_file_name_.c_str());
+  int fd = open(json_file_name_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
   if (fd < 0) {
-    GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", file.c_str());
+    GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", json_file_name_.c_str());
     return INTERNAL_ERROR;
   }
   if (close(fd) != 0) {
-    GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", file.c_str());
+    GELOGE(INTERNAL_ERROR, "Fail to close the file: %s.", json_file_name_.c_str());
     return INTERNAL_ERROR;
   }
-  json_file_name_ = file;
+  is_json_file_create_ = true;
 
   GELOGD("success to create analyzer file[%s]!", json_file_name_.c_str());
   return SUCCESS;

@@ -231,6 +236,12 @@ ge::Status Analyzer::DoAnalyze(DataInfo &data_info) {
     GELOGE(status, "save op info failed!");
     return FAILED;
   }
+  // create json file
+  status = CreateAnalyzerFile();
+  if (status != SUCCESS) {
+    GELOGE(status, "create analyzer file failed!");
+    return status;
+  }
   // save data to file
   return SaveAnalyzerDataToFile();
 }


ge/analyzer/analyzer.h  (+2 -0)

@@ -24,6 +24,7 @@
 #include <mutex>
 #include <memory>
 #include <fstream>
+#include <atomic>
 
 #include "external/ge/ge_api_types.h"
 #include "graph/compute_graph.h"

@@ -181,6 +182,7 @@ class Analyzer {
   std::mutex file_mutex_;  // protect json_file_
   std::ofstream json_file_;
   std::string json_file_name_;
+  std::atomic_bool is_json_file_create_{false};
 };
 }  // namespace ge
 #endif  // DOMI_ANALYZER_ANANLYZER_H_

ge/client/CMakeLists.txt  (+2 -0)

@@ -29,6 +29,7 @@ file(GLOB PROTO_HEADER_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
 file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "ge_api.cc"
+    "ge_prof.cc"
 )
 
 ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})

@@ -66,5 +67,6 @@ target_link_libraries(ge_client
     ${slog}
     ${mmpa}
     ${runtime}
+    ${msprof}
     rt
     dl)

ge/client/ge_api.cc  (+1 -1)

@@ -39,7 +39,7 @@ using std::vector;
 namespace {
 const int32_t kMaxStrLen = 128;
-}
+}  // namespace
 
 static bool g_ge_initialized = false;
 static std::mutex g_ge_release_mutex;  // GEFinalize and ~Session use


ge/client/module.mk  (+4 -0)

@@ -4,6 +4,7 @@ LOCAL_PATH := $(call my-dir)
 COMMON_LOCAL_SRC_FILES := \
     proto/ge_api.proto \
     ge_api.cc \
+    ge_prof.cc \
 
 COMMON_LOCAL_C_INCLUDES := \

@@ -69,6 +70,8 @@ LOCAL_SHARED_LIBRARIES := \
     libregister \
     libge_compiler \
     libge_common \
+    libmsprof
+
 LOCAL_LDFLAGS := -lrt -ldl

@@ -102,6 +105,7 @@ LOCAL_SHARED_LIBRARIES := \
     libruntime \
     libge_compiler \
     libge_common \
+    libmsprof
 
 LOCAL_LDFLAGS := -lrt -ldl


ge/common/CMakeLists.txt  (+1 -0)

@@ -27,6 +27,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "context/ctx.cc"
     "cust_aicpu_kernel_store.cc"
     "debug/memory_dumper.cc"
+    "dump/dump_properties.cc"
     "fmk_error_codes.cc"
     "formats/format_transfers/datatype_transfer.cc"
     "formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc"


ge/common/dump/dump_manager.cc  (+3 -8)

@@ -49,7 +49,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
     dump_properties_.ClearDumpPropertyValue();
     return SUCCESS;
   }
+  dump_properties_.SetDumpStatus(dump_status);
+
   dump_op_switch = dump_config.dump_op_switch;
+  dump_properties_.SetDumpOpSwitch(dump_op_switch);
   if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) {
     GELOGE(PARAM_INVALID, "Dump list is invalid,dump_op_switch is %s", dump_op_switch.c_str());
     return PARAM_INVALID;

@@ -95,14 +98,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
   return SUCCESS;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpManager::IsDumpOpen() {
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (!dump_properties_.GetDumpPath().empty()) {
-    return true;
-  }
-  return false;
-}
-
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties() {
   std::lock_guard<std::mutex> lock(mutex_);
   return dump_properties_;


ge/common/dump/dump_manager.h  (+0 -1)

@@ -28,7 +28,6 @@ class DumpManager {
   static DumpManager &GetInstance();
 
   Status SetDumpConf(const DumpConfig &dump_config);
-  bool IsDumpOpen();
   const DumpProperties &GetDumpProperties();
   void SetModelName(const std::string &model_name);
   const std::string &GetModelName();


ge/common/dump/dump_op.cc  (+1 -1)

@@ -16,7 +16,6 @@
 
 #include "common/dump/dump_op.h"
 
-#include "aicpu/common/aicpu_task_struct.h"
 #include "common/dump/dump_manager.h"
 #include "common/ge/datatype_util.h"
 #include "framework/common/debug/ge_log.h"

@@ -28,6 +27,7 @@
 #include "proto/ge_ir.pb.h"
 #include "proto/op_mapping_info.pb.h"
 #include "runtime/mem.h"
+#include "aicpu/common/aicpu_task_struct.h"
 
 namespace {
 const uint32_t kAicpuLoadFlag = 1;


ge/common/dump/dump_properties.cc  (+34 -3)

@@ -31,7 +31,7 @@
 namespace {
 const std::string kEnableFlag = "1";
+const std::string kDumpStatusOpen = "on";
 const uint32_t kAicoreOverflow = (0x1 << 0);
 const uint32_t kAtomicOverflow = (0x1 << 1);
 const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow);

@@ -81,12 +81,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti
   if (enable_dump_ == kEnableFlag) {
     std::string dump_step;
     if (GetContext().GetOption(OPTION_EXEC_DUMP_STEP, dump_step) == GRAPH_SUCCESS) {
-      GELOGD("Get dump step %s successfully", dump_step.c_str());
+      GELOGI("Get dump step %s successfully", dump_step.c_str());
       SetDumpStep(dump_step);
     }
     string dump_mode;
     if (GetContext().GetOption(OPTION_EXEC_DUMP_MODE, dump_mode) == GRAPH_SUCCESS) {
-      GELOGD("Get dump mode %s successfully", dump_mode.c_str());
+      GELOGI("Get dump mode %s successfully", dump_mode.c_str());
       SetDumpMode(dump_mode);
     }
     AddPropertyValue(DUMP_ALL_MODEL, {});

@@ -192,6 +192,37 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti
   return dump_mode_;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpStatus(const std::string &status) {
+  dump_status_ = status;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpStatus() const {
+  return dump_status_;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch(
+  const std::string &dump_op_switch) {
+  dump_op_switch_ = dump_op_switch;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperties::GetDumpOpSwitch() const {
+  return dump_op_switch_;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsSingleOpNeedDump() const {
+  if (dump_op_switch_ == kDumpStatusOpen) {
+    return true;
+  }
+  return false;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsDumpOpen() const {
+  if (enable_dump_ == kEnableFlag || dump_status_ == kDumpStatusOpen) {
+    return true;
+  }
+  return false;
+}
+
 void DumpProperties::CopyFrom(const DumpProperties &other) {
   if (&other != this) {
     enable_dump_ = other.enable_dump_;


ge/common/dump/dump_properties.h  (+18 -0)

@@ -61,10 +61,26 @@ class DumpProperties {
 
   const std::string &GetDumpMode() const;
 
+  void SetDumpStatus(const std::string &status);
+
+  const std::string &GetDumpStatus() const;
+
+  void SetDumpOpSwitch(const std::string &dump_op_switch);
+
+  const std::string &GetDumpOpSwitch() const;
+
   bool IsOpDebugOpen() const { return is_op_debug_; }
 
+  bool IsDumpOpen() const;
+
+  bool IsSingleOpNeedDump() const;
+
   uint32_t GetOpDebugMode() const { return op_debug_mode_; }
 
+  const std::string &GetEnableDump() const { return enable_dump_; }
+
+  const std::string &GetEnableDumpDebug() const { return enable_dump_debug_; }
+
  private:
   void CopyFrom(const DumpProperties &other);

@@ -76,6 +92,8 @@ class DumpProperties {
   std::string dump_path_;
   std::string dump_step_;
   std::string dump_mode_;
+  std::string dump_status_;
+  std::string dump_op_switch_;
   std::map<std::string, std::set<std::string>> model_dump_properties_map_;
 
   bool is_op_debug_ = false;


ge/common/ge/op_tiling_manager.cc  (+6 -3)

@@ -15,14 +15,15 @@
  */
 
 #include "common/ge/op_tiling_manager.h"
+#include "common/util/error_manager/error_manager.h"
 #include "framework/common/debug/log.h"
 #include <string>
 
 namespace {
 const char *const kEnvName = "ASCEND_OPP_PATH";
 const std::string kDefaultPath = "/usr/local/Ascend/opp";
-const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/liboptiling.so";
-const std::string kDefaultCustomTilingPath = "/op_impl/custom/liboptiling.so";
+const std::string kDefaultBuiltInTilingPath = "/op_impl/built-in/ai_core/tbe/op_tiling/liboptiling.so";
+const std::string kDefaultCustomTilingPath = "/op_impl/custom/ai_core/tbe/op_tiling/liboptiling.so";
 const uint8_t kPrefixIndex = 9;
 }  // namespace

@@ -44,7 +45,9 @@ std::string OpTilingManager::GetPath() {
   if (opp_path_env != nullptr) {
     char resolved_path[PATH_MAX];
     if (realpath(opp_path_env, resolved_path) == NULL) {
-      GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'(%s) is invalid path.", opp_path_env);
+      ErrorManager::GetInstance().ATCReportErrMessage("E19024", {"env", "value", "situation"},
+                                                      {"ASCEND_OPP_PATH", opp_path_env, "loading the tiling lib"});
+      GELOGE(PARAM_INVALID, "Failed load tiling lib as env 'ASCEND_OPP_PATH'[%s] is invalid path.", opp_path_env);
       return std::string();
     }
     opp_path = resolved_path;


ge/common/ge_common.mk  (+1 -0)

@@ -12,6 +12,7 @@ GE_COMMON_LOCAL_SRC_FILES := \
     math/fp16_math.cc \
     debug/memory_dumper.cc \
    formats/utils/formats_trans_utils.cc \
+    dump/dump_properties.cc \
     formats/format_transfers/datatype_transfer.cc \
     formats/format_transfers/format_transfer_transpose.cc \
     formats/format_transfers/format_transfer_nchw_nc1hwc0.cc \


ge/common/helper/model_cache_helper.cc  (+19 -1)

@@ -497,7 +497,25 @@ Status ModelCacheHelper::LoadJsonFromFile(const string &file_name, Json &json) c
     GELOGW("Fail to open the file: %s.", path.c_str());
     return INTERNAL_ERROR;
   }
-  ifs >> json;
+  try {
+    ifs >> json;
+  } catch (nlohmann::detail::parse_error e) {
+    GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
+    return INTERNAL_ERROR;
+  } catch (nlohmann::detail::invalid_iterator e) {
+    GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
+    return INTERNAL_ERROR;
+  } catch (nlohmann::detail::type_error e) {
+    GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
+    return INTERNAL_ERROR;
+  } catch (nlohmann::detail::out_of_range e) {
+    GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
+    return INTERNAL_ERROR;
+  } catch (nlohmann::detail::other_error e) {
+    GELOGW("Fail to load json from file, json throw an error:%s.", e.what());
+    return INTERNAL_ERROR;
+  }
 
   if (!json.is_object()) {
     GELOGW("Fail to load the json file: %s.", path.c_str());
     return INTERNAL_ERROR;

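Side note on the hunk above: the five catch clauses are identical, and all five exception types derive from nlohmann::json::exception, so a single handler on the base class would behave the same. A minimal standalone sketch of that compact form (illustrative only; the function name is hypothetical, not GE source):

#include <fstream>
#include <string>
#include <nlohmann/json.hpp>

// Sketch: one base-class catch covers parse_error, invalid_iterator,
// type_error, out_of_range and other_error.
bool TryLoadJson(const std::string &path, nlohmann::json &json) {
  std::ifstream ifs(path);
  if (!ifs.is_open()) {
    return false;  // caller would log and return INTERNAL_ERROR
  }
  try {
    ifs >> json;
  } catch (const nlohmann::json::exception &e) {
    // e.what() carries the same diagnostic the per-type handlers log above.
    return false;
  }
  return json.is_object();
}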

ge/common/helper/model_helper.cc  (+17 -2)

@@ -41,7 +41,22 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil
                                        const uint8_t *data, size_t size) {
   if (size < 1 || size > UINT32_MAX) {
     GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size);
-    ErrorManager::GetInstance().ATCReportErrMessage("E19022");
+    if (size > UINT32_MAX) {
+      string item = "item";
+      if (type == MODEL_DEF) {
+        item = "model info";
+      } else if (type == WEIGHTS_DATA) {
+        item = "weight data";
+      } else if (type == TASK_INFO) {
+        item = "task info";
+      } else if (type == TBE_KERNELS) {
+        item = "tbe kernels";
+      } else if (type == CUST_AICPU_KERNELS) {
+        item = "aicpu kernels";
+      }
+      ErrorManager::GetInstance().ATCReportErrMessage("E19023", {"size", "item", "maxsize"},
+                                                      {std::to_string(size), item, std::to_string(UINT32_MAX)});
+    }
     return PARAM_INVALID;
   }
   if (data == nullptr) {

@@ -263,7 +278,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c
   }
 
   Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_);
-  if (ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_) != SUCCESS) {
+  if (status != SUCCESS) {
     GELOGE(status, "Parse model content failed!");
     return status;
   }


ge/common/profiling/profiling_manager.cc  (+15 -2)

@@ -51,10 +51,23 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager &ProfilingMana
   return profiling_manager;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options) {
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options,
+                                                                                   bool convert_2_phy_device_id) {
 #ifdef DAVINCI_SUPPORT_PROFILING
   vector<int32_t>().swap(device_id_);
-  device_id_.push_back(options.device_id);
+  // profiling need phy device id
+  if (!convert_2_phy_device_id) {
+    device_id_.push_back(options.device_id);
+  } else {
+    uint32_t phy_device_id = 0;
+    rtError_t rt_ret = rtGetDevicePhyIdByIndex(static_cast<uint32_t>(options.device_id), &phy_device_id);
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id);
+      return FAILED;
+    }
+    device_id_.push_back(phy_device_id);
+  }
+
   job_id_ = options.job_id;
 
   Status ret;


ge/common/profiling/profiling_manager.h  (+1 -1)

@@ -69,7 +69,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
   ProfilingManager();
   virtual ~ProfilingManager();
   static ProfilingManager &Instance();
-  ge::Status Init(const Options &options);
+  ge::Status Init(const Options &options, bool convert_2_phy_device_id = false);
   ge::Status InitFromOptions(const Options &options);
   ge::Status InitFromAclCfg(const std::string &config);
   ge::Status StartProfiling(int32_t iter, int32_t device_id);


ge/common/properties_manager.cc  (+6 -0)

@@ -172,6 +172,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpProperties &PropertiesManag
   return dump_properties_map_[session_id];
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::AddDumpProperties(
+  uint64_t session_id, const DumpProperties &dump_properties) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  dump_properties_map_.emplace(session_id, dump_properties);
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::RemoveDumpProperties(uint64_t session_id) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto iter = dump_properties_map_.find(session_id);


ge/common/properties_manager.h  (+5 -1)

@@ -23,8 +23,8 @@
 #include <string>
 #include <vector>
 
-#include "graph/op_desc.h"
 #include "common/dump/dump_properties.h"
+#include "graph/op_desc.h"
 
 namespace ge {
 // Configuration property management

@@ -83,6 +83,10 @@ class PropertiesManager {
   void SetPropertyDelimiter(const std::string &de);
 
   DumpProperties &GetDumpProperties(uint64_t session_id);
+
+  const map<uint64_t, DumpProperties> &GetDumpPropertiesMap() { return dump_properties_map_; }
+
+  void AddDumpProperties(uint64_t session_id, const DumpProperties &dump_properties);
   void RemoveDumpProperties(uint64_t session_id);
 
  private:


ge/common/util.cc  (+68 -2)

@@ -19,16 +19,16 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 
-#include <unistd.h>
 #include <regex.h>
+#include <unistd.h>
 #include <algorithm>
 #include <climits>
 #include <cstdlib>
 #include <ctime>
 #include <fstream>
 
-#include "external/ge/ge_api_error_codes.h"
 #include "common/util/error_manager/error_manager.h"
+#include "external/ge/ge_api_error_codes.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/fmk_types.h"
 #include "framework/common/ge_inner_error_codes.h"

@@ -58,6 +58,7 @@
 const int kMaxFileSizeLimit = INT_MAX;
 const int kMaxBuffSize = 256;
 const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character";
+constexpr uint32_t MAX_CONFIG_FILE_BYTE = 10 * 1024 * 1024;
 }  // namespace
 
 namespace ge {

@@ -482,4 +483,69 @@ FMK_FUNC_HOST_VISIBILITY bool ValidateStr(const std::string &str, const std::str
   regfree(&reg);
   return true;
 }
+
+FMK_FUNC_HOST_VISIBILITY bool IsValidFile(const char *file_path) {
+  if (file_path == nullptr) {
+    GELOGE(PARAM_INVALID, "Config path is null.");
+    return false;
+  }
+  if (!CheckInputPathValid(file_path)) {
+    GELOGE(PARAM_INVALID, "Config path is invalid: %s", file_path);
+    return false;
+  }
+  // Normalize the path
+  std::string resolved_file_path = RealPath(file_path);
+  if (resolved_file_path.empty()) {
+    GELOGE(PARAM_INVALID, "Invalid input file path [%s], make sure that the file path is correct.", file_path);
+    return false;
+  }
+
+  mmStat_t stat = {0};
+  int32_t ret = mmStatGet(resolved_file_path.c_str(), &stat);
+  if (ret != EN_OK) {
+    GELOGE(PARAM_INVALID, "cannot get config file status, which path is %s, maybe not exist, return %d, errcode %d",
+           resolved_file_path.c_str(), ret, mmGetErrorCode());
+    return false;
+  }
+  if ((stat.st_mode & S_IFMT) != S_IFREG) {
+    GELOGE(PARAM_INVALID, "config file is not a common file, which path is %s, mode is %u", resolved_file_path.c_str(),
+           stat.st_mode);
+    return false;
+  }
+  if (stat.st_size > MAX_CONFIG_FILE_BYTE) {
+    GELOGE(PARAM_INVALID, "config file %s size[%ld] is larger than max config file Bytes[%u]",
+           resolved_file_path.c_str(), stat.st_size, MAX_CONFIG_FILE_BYTE);
+    return false;
+  }
+  return true;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status CheckPath(const char *path, size_t length) {
+  if (path == nullptr) {
+    GELOGE(PARAM_INVALID, "Config path is invalid.");
+    return PARAM_INVALID;
+  }
+
+  if (strlen(path) != length) {
+    GELOGE(PARAM_INVALID, "Path is invalid or length of config path is not equal to given length.");
+    return PARAM_INVALID;
+  }
+
+  if (length == 0 || length > MMPA_MAX_PATH) {
+    GELOGE(PARAM_INVALID, "Length of config path is invalid.");
+    return PARAM_INVALID;
+  }
+
+  INT32 is_dir = mmIsDir(path);
+  if (is_dir != EN_OK) {
+    GELOGE(PATH_INVALID, "Open directory %s failed, maybe it is not exit or not a dir", path);
+    return PATH_INVALID;
+  }
+
+  if (mmAccess2(path, M_R_OK) != EN_OK) {
+    GELOGE(PATH_INVALID, "Read path[%s] failed, errmsg[%s]", path, strerror(errno));
+    return PATH_INVALID;
+  }
+  return SUCCESS;
+}
 }  // namespace ge

ge/executor/CMakeLists.txt  (+3 -1)

@@ -22,7 +22,7 @@ file(GLOB PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../../proto/insert_op.proto"
     "../../proto/op_mapping_info.proto"
     "../../proto/ge_ir.proto"
-    "../proto/dump_task.proto"
+    "../../proto/dump_task.proto"
 )
 
 file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}

@@ -73,6 +73,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
     "../graph/manager/trans_var_data_utils.cc"
     "../graph/manager/util/debug.cc"
     "../hybrid/hybrid_davinci_model_stub.cc"
+    "../hybrid/node_executor/aicpu/aicpu_ext_info.cc"
     "../model/ge_model.cc"
     "../model/ge_root_model.cc"
     "../omm/csa_interact.cc"

@@ -118,6 +119,7 @@ target_link_libraries(ge_executor
     ${slog}
     ${mmpa}
     ${msprof}
+    ${error_manager}
    rt
     dl)



ge/executor/ge_executor.cc  (+48 -1)

@@ -182,6 +182,37 @@ bool IsDynamicImageSizeMatchModel(uint64_t image_height, uint64_t image_width,
   GELOGE(ge::FAILED, "Dynamic resolution (%lu,%lu) can not match the gear of model.", image_height, image_width);
   return false;
 }
+
+bool IsDynmaicDimsSizeMatchModel(const vector<uint64_t> cur_dynamic_dims, const vector<vector<int64_t>> &batch_info) {
+  if (batch_info.empty()) {
+    GELOGE(ge::FAILED, "Dynamic batch info is empty.");
+    return false;
+  }
+
+  bool find_match = false;
+  for (auto resolution : batch_info) {
+    if (cur_dynamic_dims.size() != resolution.size()) {
+      GELOGE(ge::FAILED, "Cur dynamic dims param num is %zu, current resolution size is %zu.", cur_dynamic_dims.size(),
+             resolution.size());
+      return false;
+    }
+    bool flag = true;
+    for (std::size_t i = 0; i < resolution.size(); ++i) {
+      if (cur_dynamic_dims[i] != static_cast<uint64_t>(resolution[i])) {
+        flag = false;
+        break;
+      }
+    }
+    if (flag) {
+      find_match = true;
+      break;
+    }
+  }
+  if (!find_match) {
+    GELOGE(ge::FAILED, "choose dynamic dims can not match the gear of model.");
+  }
+  return find_match;
+}
 }  // namespace
 
 namespace ge {

@@ -347,9 +378,21 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u
   vector<uint64_t> cur_dynamic_dims;
   Status ret = GetCurDynamicDims(model_id, dynamic_dims, cur_dynamic_dims);
   if (ret != SUCCESS) {
-    GELOGE(FAILED, "Set cur gear dynmaic dims failed");
+    GELOGE(FAILED, "Set cur gear dynamic dims failed");
     return FAILED;
   }
+  std::vector<std::vector<int64_t>> batch_info;
+  int32_t dynamic_type = static_cast<int32_t>(FIXED);
+  ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info, dynamic_type);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Get dynamic input info failed.");
+    return ret;
+  }
+
+  if (!IsDynmaicDimsSizeMatchModel(cur_dynamic_dims, batch_info)) {
+    GELOGE(PARAM_INVALID, "The current dynamic input does not match the gear of the model.");
+    return PARAM_INVALID;
+  }
 
   ret = GraphExecutor::SetDynamicSize(model_id, cur_dynamic_dims, static_cast<int32_t>(DYNAMIC_DIMS));
   if (ret != SUCCESS) {

@@ -410,6 +453,10 @@ Status GeExecutor::GetCurDynamicDims(uint32_t model_id, const vector<uint64_t> &
   for (std::size_t i = 0; i < all_data_dims.size(); ++i) {
     if (all_data_dims[i] < 0) {
       cur_dynamic_dims.push_back(dynamic_dims[i]);
+    } else if (static_cast<uint64_t>(all_data_dims[i]) != dynamic_dims[i]) {
+      GELOGE(PARAM_INVALID, "Static dims should be same, index: %zu value: %d should be %d", i, dynamic_dims[i],
+             all_data_dims[i]);
+      return PARAM_INVALID;
     }
   }
   return SUCCESS;


ge/executor/module.mk  (+3 -0)

@@ -60,6 +60,7 @@ local_ge_executor_src_files := \
     ../single_op/task/aicpu_task_builder.cc \
     ../single_op/task/aicpu_kernel_task_builder.cc \
     ../hybrid/hybrid_davinci_model_stub.cc \
+    ../hybrid/node_executor/aicpu/aicpu_ext_info.cc \
 
 local_ge_executor_c_include := \
     proto/insert_op.proto \

@@ -87,6 +88,7 @@ local_ge_executor_shared_library := \
     libgraph \
     libregister \
     libmsprof \
+    liberror_manager \
 
 local_ge_executor_ldflags := -lrt -ldl \

@@ -137,6 +139,7 @@ LOCAL_SHARED_LIBRARIES := \
     libgraph \
     libregister \
     libmsprof \
+    liberror_manager \
 
 LOCAL_LDFLAGS += $(local_ge_executor_ldflags)




ge/ge_inference.mk  (+5 -0)

@@ -254,6 +254,7 @@ OME_HOST_SRC_FILES := \
     single_op/stream_resource.cc \
     single_op/single_op_manager.cc \
     hybrid/hybrid_davinci_model_stub.cc \
+    hybrid/node_executor/aicpu/aicpu_ext_info.cc \
 #    graph/load/new_model_manager/task_info/hccl_task_info.cc
 
 OME_DEVICE_SRC_FILES := $(OME_HOST_SRC_FILES)

@@ -286,6 +287,7 @@ COMMON_LOCAL_C_INCLUDES := \
     $(TOPDIR)inc/runtime \
     $(TOPDIR)libc_sec/include \
     $(TOPDIR)ops/built-in/op_proto/inc \
+    $(TOPDIR)toolchain/ide/ide-daemon/external \
     third_party/json/include \
     third_party/protobuf/include \
     third_party/opencv/include \

@@ -340,6 +342,7 @@ DEVICE_LOCAL_C_INCLUDES := \
     $(TOPDIR)inc/runtime \
     $(TOPDIR)ops/built-in/op_proto/inc \
     $(TOPDIR)framework/domi \
+    $(TOPDIR)toolchain/ide/ide-daemon/external \
     third_party/json/include \
     third_party/protobuf/include \
     third_party/opencv/include \

@@ -368,6 +371,7 @@ LOCAL_SRC_FILES += $(BUILER_SRC_FILES)
 LOCAL_SRC_FILES += $(ANALYZER_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
+    libadump_server_stub \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \

@@ -432,6 +436,7 @@ LOCAL_C_INCLUDES := $(DEVICE_LOCAL_C_INCLUDES)
 LOCAL_C_INCLUDES += $(ANALYZER_LOCAL_INCLUDES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
+    libadump_server_stub \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \


ge/ge_local_engine/engine/host_cpu_engine.cc  (+57 -32)

@@ -25,40 +25,65 @@
 #include "common/ge/plugin_manager.h"
 #include "graph/utils/type_utils.h"
 #include "common/fp16_t.h"
+#include "common/math/math_util.h"
 
 namespace {
-#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \
-  case (DTYPE): { \
-    GeTensorPtr ge_tensor = nullptr; \
-    if (need_create_flag) { \
-      int64_t data_num = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \
-      std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \
-      if (buf == nullptr) { \
-        GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \
-               static_cast<size_t>(sizeof(TYPE) * data_num)); \
-        return MEMALLOC_FAILED; \
-      } \
-      ge_tensor = MakeShared<GeTensor>(out_desc); \
-      GE_CHECK_NOTNULL(ge_tensor); \
-      GELOGI("node:%s allocate output %zu, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \
-      ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)); \
-      ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \
-      ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \
-      outputs.emplace_back(ge_tensor); \
-    } else { \
-      ge_tensor = outputs[i]; \
-      GE_CHECK_NOTNULL(ge_tensor); \
-      GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \
-             reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \
-    } \
-    auto tensor = TensorAdapter::AsTensor(*ge_tensor); \
-    auto tensor_name = op_desc->GetOutputNameByIndex(i); \
-    GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \
-                               op_desc->GetName().c_str(), i); \
-    GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
-           op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
-    named_outputs.emplace(tensor_name, tensor); \
-    break; \
+#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \
+  case (DTYPE): { \
+    GeTensorPtr ge_tensor = nullptr; \
+    if (need_create_flag) { \
+      int64_t num_size = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \
+      if (out_desc.GetShape().IsUnknownShape()) { \
+        std::vector<std::pair<int64_t, int64_t>> range; \
+        if (out_desc.GetShapeRange(range) != GRAPH_SUCCESS) { \
+          GELOGE(INTERNAL_ERROR, "Get shape range failed, node:%s", op_desc->GetName().c_str()); \
+          return INTERNAL_ERROR; \
+        } \
+        int64_t max_range_size = 1; \
+        for (const auto &item : range) { \
+          FMK_INT64_MULCHECK(max_range_size, item.second); \
+          max_range_size *= item.second; \
+        } \
+        num_size = max_range_size; \
+      } \
+      if (num_size < 0) { \
+        GELOGE(INTERNAL_ERROR, "node:%s, get size for output %zu failed, num=%lld", op_desc->GetName().c_str(), i, \
+               num_size); \
+        return INTERNAL_ERROR; \
+      } \
+      auto data_num = static_cast<uint64_t>(num_size); \
+      GELOGI("node:%s allocate output %zu start, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \
+      std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]()); \
+      if (buf == nullptr) { \
+        GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", \
+               static_cast<size_t>(sizeof(TYPE) * data_num)); \
+        return MEMALLOC_FAILED; \
+      } \
+      ge_tensor = MakeShared<GeTensor>(out_desc); \
+      GE_CHECK_NOTNULL(ge_tensor); \
+      GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, \
+             data_num * sizeof(TYPE)); \
+      if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) { \
+        GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str()); \
+        return MEMALLOC_FAILED; \
+      } \
+      ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); \
+      ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); \
+      outputs.emplace_back(ge_tensor); \
+    } else { \
+      ge_tensor = outputs[i]; \
+      GE_CHECK_NOTNULL(ge_tensor); \
+      GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \
+             reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \
+    } \
+    auto tensor = TensorAdapter::AsTensor(*ge_tensor); \
+    auto tensor_name = op_desc->GetOutputNameByIndex(i); \
+    GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", \
+                               op_desc->GetName().c_str(), i); \
+    GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu", \
+           op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize()); \
+    named_outputs.emplace(tensor_name, tensor); \
+    break; \
   }
 }  // namespace


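The rewritten CREATE_OUTPUT_CASE above sizes unknown-shape outputs from the upper bounds of the tensor's shape range, using an overflow-checked product (FMK_INT64_MULCHECK) instead of trusting GetShapeSize(). A plain-function sketch of that sizing rule follows (illustrative only; the function and its signature are not GE source):

#include <cstdint>
#include <utility>
#include <vector>

// Returns the element count to allocate for one output, or -1 on error.
// 'range' holds (min, max) bounds per dimension of an unknown-shape tensor.
int64_t ElementCountToAllocate(bool is_scalar, bool is_unknown_shape, int64_t shape_size,
                               const std::vector<std::pair<int64_t, int64_t>> &range) {
  int64_t num = is_scalar ? 1 : shape_size;
  if (is_unknown_shape) {
    int64_t max_range_size = 1;
    for (const auto &dim : range) {
      // Overflow-checked multiply, as FMK_INT64_MULCHECK does in the macro.
      if (dim.second != 0 && max_range_size > INT64_MAX / dim.second) {
        return -1;
      }
      max_range_size *= dim.second;
    }
    num = max_range_size;
  }
  return num < 0 ? -1 : num;  // a negative count (unresolved dim) is an error
}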


ge/ge_runner.mk  (+9 -0)

@@ -296,6 +296,7 @@ LIBGE_LOCAL_SRC_FILES := \
 LIBCLIENT_LOCAL_SRC_FILES := \
     proto/ge_api.proto \
     client/ge_api.cc \
+    client/ge_prof.cc \
 
 RUNNER_LOCAL_C_INCLUDES := \
     $(LOCAL_PATH) ./ \

@@ -312,6 +313,7 @@ RUNNER_LOCAL_C_INCLUDES := \
     $(TOPDIR)libc_sec/include \
     $(TOPDIR)ops/built-in/op_proto/inc \
     $(TOPDIR)framework/domi/analyzer \
+    $(TOPDIR)toolchain/ide/ide-daemon/external \
     proto/fwk_adapter.proto \
     proto/ge_ir.proto \
     proto/insert_op.proto \

@@ -353,6 +355,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
 LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
+    libadump_server \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \

@@ -371,6 +374,7 @@ LOCAL_LDFLAGS := -lrt -ldl
 LOCAL_SHARED_LIBRARIES += \
     libruntime \
     libresource \
+    stub/libascend_hal \
 
 include $(BUILD_HOST_SHARED_LIBRARY)

@@ -389,6 +393,7 @@ endif
 LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES)
 
 LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc
+LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_prof.cc
 
 LOCAL_SHARED_LIBRARIES :=

@@ -438,6 +443,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
 LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
+    libadump_server \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \

@@ -450,6 +456,7 @@ LOCAL_LDFLAGS := -lrt -ldl
 LOCAL_SHARED_LIBRARIES += \
     libruntime \
     libresource \
+    stub/libascend_hal \
 
 include $(BUILD_HOST_STATIC_LIBRARY)

@@ -469,6 +476,7 @@ LOCAL_SRC_FILES := $(LIBGE_LOCAL_SRC_FILES)
 LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
+    libadump_server \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \

@@ -481,5 +489,6 @@ LOCAL_LDFLAGS := -lrt -ldl
 LOCAL_SHARED_LIBRARIES += \
     libruntime \
     libresource \
+    libascend_hal \
 
 include $(BUILD_STATIC_LIBRARY)

ge/graph/build/memory/block_mem_assigner.cc  (+5 -0)

@@ -1296,6 +1296,11 @@ void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &s
       return;
     }
     if (dest[i] != nullptr && src[i] != nullptr) {
+      if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) {
+        GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.", i,
+               dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_);
+        continue;
+      }
       for (auto &symbol : src[i]->SymbolList()) {
         dest[i]->AddSymbol(symbol);
       }


ge/graph/build/memory/graph_mem_assigner.cc  (+50 -3)

@@ -227,7 +227,10 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse
   if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) {
     GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset,
            VarManager::Instance(session_id)->GetGraphMemoryMaxSize());
-    ErrorManager::GetInstance().ATCReportErrMessage("E19022");
+    ErrorManager::GetInstance().ATCReportErrMessage(
+      "E19022", {"size", "item", "maxsize"},
+      {std::to_string(mem_offset), "featuremap",
+       std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())});
     return ge::FAILED;
   }
   return SUCCESS;

@@ -908,6 +911,8 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP
       GELOGE(ret, "Assign atomic workspace memory failed, node is %s.", node_op_desc->GetName().c_str());
       return ret;
     }
+  } else {
+    GELOGW("Current atomic node %s does not have attr ATOMIC_WORKSPACE_INFO.", node->GetName().c_str());
   }
 
   return SUCCESS;

@@ -1452,14 +1457,56 @@ Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int6
   return SUCCESS;
 }
 
+ge::Status GraphMemoryAssigner::IsIndependentAtomicClean(const ge::NodePtr &node,
+                                                         bool &is_independent_atomic_clean_node) {
+  GE_CHECK_NOTNULL(node);
+  const auto &out_control_anchor = node->GetOutControlAnchor();
+  GE_CHECK_NOTNULL(out_control_anchor);
+  for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) {
+    if (peer_in_control_anchor != nullptr) {
+      auto peer_in_node = peer_in_control_anchor->GetOwnerNode();
+      auto peer_in_node_desc = peer_in_node->GetOpDesc();
+      if (peer_in_node_desc != nullptr) {
+        bool is_atomic_node = false;
+        // If GetBool fail, is_atomic_node is false.
+        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic_node);
+        if (is_atomic_node) {
+          vector<int> is_connect_netoutput;
+          // If GetBool fail, attr is_connect_netoutput is an empty vector.
+          (void)ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connect_netoutput);
+          if (!is_connect_netoutput.empty()) {
+            GELOGD("Peer in node %s is independent atomic clean node", peer_in_node->GetName().c_str());
+            is_independent_atomic_clean_node = true;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+
 ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, const vector<int64_t> &atomic_mem_start,
                                                    const vector<int64_t> &atomic_mem_size) {
   for (ge::NodePtr &node : compute_graph_->GetAllNodes()) {
     auto node_op_desc = node->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
 
-    if (((n != nullptr) && (node->GetName() == n->GetName())) ||
-        ((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) {
+    bool is_valid_atomic_clean_node = (n != nullptr) && (node->GetName() == n->GetName());
+
+    if (((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) {
+      bool is_independent_atomic_clean = false;
+      if (IsIndependentAtomicClean(node, is_independent_atomic_clean) != SUCCESS) {
+        GELOGE(FAILED, "Failed to determine the connection relationship of atomic addr clean node.");
+        return PARAM_INVALID;
+      }
+
+      is_valid_atomic_clean_node = is_valid_atomic_clean_node || (!is_independent_atomic_clean);
+    }
+
+    if (is_valid_atomic_clean_node) {
+      GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
       vector<int64_t> workspace_vector = node_op_desc->GetWorkspace();
       vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes();
       workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());


ge/graph/build/memory/graph_mem_assigner.h  (+2 -0)

@@ -175,6 +175,8 @@ class GraphMemoryAssigner {
   ge::Status SetAtomicCleanAttr(const ge::NodePtr &n, const std::vector<int64_t> &atomic_mem_start,
                                 const std::vector<int64_t> &atomic_mem_size);
 
+  ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);
+
   void AlignMemOffset(const int64_t &mem_align_size);
 
   ge::Status UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const;


+ 56
- 20
ge/graph/build/task_generator.cc View File

@@ -266,6 +266,14 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
if (is_unknown_shape) { if (is_unknown_shape) {
GE_CHK_STATUS_RET(SetUnknownShapeStream(run_context, stream), "Set unknown shape stream failed."); GE_CHK_STATUS_RET(SetUnknownShapeStream(run_context, stream), "Set unknown shape stream failed.");
} }
std::function<void()> callback = [&]() {
if (is_unknown_shape) {
if (DestroyUnknownShapeStream(run_context, stream) != SUCCESS) {
GELOGE(FAILED, "Destory unknown shape stream failed.");
}
}
};
GE_MAKE_GUARD(release, callback);


for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc(); OpDescPtr op_desc = node->GetOpDesc();
@@ -352,9 +360,6 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id,
task_list_size_after - task_list_size_before); task_list_size_after - task_list_size_before);
} }
if (is_unknown_shape) {
GE_CHK_STATUS_RET(DestroyUnknownShapeStream(run_context, stream), "Destory unknown shape stream failed.");
}
GE_TIMESTAMP_CALLNUM_EVENT_END(GenerateTask, "GraphBuild::GenerateTask"); GE_TIMESTAMP_CALLNUM_EVENT_END(GenerateTask, "GraphBuild::GenerateTask");
return SUCCESS; return SUCCESS;
} }
@@ -532,6 +537,9 @@ Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) {
(void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node);
}


(void)op_desc->DelAttr(kIsFirstNode);
(void)op_desc->DelAttr(kIsLastNode);

all_stream_ops[op_desc->GetStreamId()].emplace_back(op_desc);
}


@@ -645,8 +653,6 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
vector<uint32_t> &all_reduce_nodes) const {
GELOGI("Start AutoFindBpOpIndex");
NodePtr bp_node = nullptr;
uint32_t last_bp = 0;
uint32_t iter_end = 0;
uint32_t current_idx = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
@@ -662,20 +668,40 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetType() == NETOUTPUT) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
if (bp_node == nullptr) {
bp_node = node;
}
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
}
if (graph->GetNeedIteration()) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive",
op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx);
}
} else {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx);
}
}
}
profiling_point.end_index = iter_end;


if (bp_node == nullptr) {
GELOGW("not find bp_node.");
return SUCCESS;
}

profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node);
return SUCCESS;
}

uint32_t TaskGenerator::FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const {
uint32_t last_bp = 0;
OpDescPtr bp_op_desc = nullptr;
for (auto &in_anchor : bp_node->GetAllInDataAnchors()) {
auto out_anchor = in_anchor->GetPeerOutAnchor();
@@ -691,7 +717,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
}


GE_CHECK_NOTNULL(bp_op_desc);
current_idx = 0;
uint32_t current_idx = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
@@ -702,8 +728,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
break;
}
}
profiling_point.bp_index = last_bp;
return SUCCESS;
return last_bp;
}


Status TaskGenerator::FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str,
@@ -734,7 +759,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
ProfilingPoint &profiling_point, vector<uint32_t> &all_reduce_nodes) const {
GELOGI("Start FindBpOfEnv");
uint32_t current_idx = 0;
uint32_t iter_end = 0;
uint32_t last_bp = 0;
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
OpDescPtr op_desc = node->GetOpDesc();
@@ -745,10 +769,23 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
continue;
}


if (op_desc->GetType() == NETOUTPUT) {
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
if (graph->GetNeedIteration()) {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT + '_' + NODE_NAME_STREAM_SWITCH + "_StreamActive") {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from Node_Output_IteratorCtrl_StreamSwitch_StreamActive",
op_desc->GetName().c_str(), current_idx);
}
if (op_desc->GetName() == NODE_NAME_FLOWCTRL_LOOP_ASSIGN) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from FlowCtrl_LoopCond_ASSIGN", op_desc->GetName().c_str(), current_idx);
}
} else {
if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
profiling_point.end_index.insert(current_idx);
GELOGI("Iter end name %s, idx %u, from NETOUTPUT", op_desc->GetName().c_str(), current_idx);
}
}

if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) {
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
@@ -760,7 +797,6 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
}


profiling_point.bp_index = last_bp;
profiling_point.end_index = iter_end;
return SUCCESS;
}


@@ -857,7 +893,7 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index == 0)) {
(profiling_point.end_index.empty())) {
return SUCCESS;
}
if (profiling_point.fp_index == node_index) {
@@ -914,7 +950,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
ProfilingManager::Instance().ProfilingTrainingTraceOn();
if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
(profiling_point.end_index == 0)) {
(profiling_point.end_index.empty())) {
return SUCCESS;
}
if (profiling_point.bp_index == node_index) {
@@ -928,7 +964,7 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
bp_log_def->set_notify(false);
task_def_list.emplace_back(bp_task_def);
}
if (profiling_point.end_index == node_index) {
if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
TaskDef end_task_def;
end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);


+2 -1  ge/graph/build/task_generator.h

@@ -36,7 +36,7 @@ class OpsKernelManager;
struct ProfilingPoint {
uint32_t fp_index = 0;
uint32_t bp_index = 0;
uint32_t end_index = 0;
std::set<uint32_t> end_index;
};
// Describes infos needed by generate task for fusion node
struct FusionTaskInfo {
@@ -112,6 +112,7 @@ class TaskGenerator {
Status AutoFindFpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point) const;
Status AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
vector<uint32_t> &all_reduce_nodes) const;
uint32_t FindLastBpFromBpNode(const ComputeGraphPtr &graph, const NodePtr &bp_node) const;


Status FindFpOfEnv(const ComputeGraphPtr &graph, const std::string &fp_point_str,
ProfilingPoint &profiling_point) const;
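Across task_generator.cc and this header, the profiling end point widens from a single uint32_t end_index to a std::set<uint32_t>: with flow-control iteration there can be more than one iteration-end node (the Node_Output StreamActive node and the FlowCtrl loop-assign node), and the per-node check becomes a set lookup. A small sketch of that change, using illustrative names rather than the GE types:

#include <cstdint>
#include <set>

struct ProfilingPointSketch {
  uint32_t fp_index = 0;
  uint32_t bp_index = 0;
  std::set<uint32_t> end_index;  // was: uint32_t end_index = 0;
};

// Old check: profiling_point.end_index == node_index.
// New check: membership in the set, as in InsertProfilingTaskAfter.
bool IsIterationEnd(const ProfilingPointSketch &p, uint32_t node_index) {
  return p.end_index.find(node_index) != p.end_index.end();
}

int main() {
  ProfilingPointSketch p;
  p.end_index.insert(12);  // e.g. the StreamActive end node
  p.end_index.insert(30);  // e.g. the FlowCtrl loop-assign node
  return IsIterationEnd(p, 30) ? 0 : 1;
}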


+15 -2  ge/graph/load/new_model_manager/davinci_model.cc

@@ -125,6 +125,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptr<ModelListener
rt_model_stream_(nullptr),
is_inner_model_stream_(false),
is_async_mode_(false),
last_execute_mode_(false),
session_id_(0),
device_id_(0),
maxDumpOpNum_(0),
@@ -2879,6 +2880,12 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<v
}
}
}
auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId());
if (it == zero_copy_op_id_batch_label_.end()) {
zero_copy_task.SetBatchLabel(kDefaultBatchLable);
} else {
zero_copy_task.SetBatchLabel(it->second);
}


std::lock_guard<std::mutex> lock(outside_addrs_mutex_);
if (zero_copy_task.IsTaskArgsSet()) {
@@ -3045,6 +3052,9 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &
data.first, addr, size, buffer_addr);
// For input data, just copy for rts task.
for (ZeroCopyTask &task : zero_copy_tasks_) {
if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) {
continue;
}
uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr);
if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) {
return FAILED;
@@ -3365,6 +3375,7 @@ Status DavinciModel::InitModelStream(rtStream_t stream) {
if (is_async_mode_) {
rt_model_stream_ = stream;
is_inner_model_stream_ = false;
last_execute_mode_ = true;
return SUCCESS;
}


@@ -3376,12 +3387,14 @@ Status DavinciModel::InitModelStream(rtStream_t stream) {


rt_model_stream_ = stream;
is_inner_model_stream_ = false;
last_execute_mode_ = false;
return SUCCESS;
}


if (rt_model_stream_ == nullptr) {
if (last_execute_mode_ || (rt_model_stream_ == nullptr)) {
GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_FORBIDDEN_DEFAULT));
is_inner_model_stream_ = true;
last_execute_mode_ = false;
}


return SUCCESS;

}


void DavinciModel::FreeFeatureMapMem() {
if (std::getenv(kEnvGeuseStaticMemory) != nullptr) {
if (std::getenv(kEnvGeuseStaticMemory) != nullptr && is_inner_mem_base_) {
string weight_memory_key = std::to_string(0) + "_f";
if (MemManager::Instance(RT_MEMORY_HBM)->GetMemoryAddr(weight_memory_key) != nullptr) {
GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_HBM)->FreeMemory(weight_memory_key, GetDeviceId()),
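InitModelStream now records last_execute_mode_ so an inner stream created for one execution mode is not silently reused after the model ran on a caller-provided asynchronous stream; when the mode flips back, the inner stream is created afresh. A condensed sketch of that bookkeeping, with the rt* calls replaced by integer ids (an assumption made to keep the example runnable):

#include <iostream>

// Sketch of the InitModelStream bookkeeping; integer ids stand in for
// rtStream_t and last_execute_async_ mirrors last_execute_mode_.
class ModelStreamSketch {
 public:
  int InitModelStream(int external_stream, bool async) {
    if (external_stream != 0) {   // caller supplies the stream
      stream_ = external_stream;
      is_inner_stream_ = false;
      last_execute_async_ = async;
      return stream_;
    }
    // No caller stream: recreate the inner stream if the previous run
    // used an external async stream, otherwise keep reusing it.
    if (last_execute_async_ || stream_ == 0) {
      stream_ = next_id_++;       // stands in for rtStreamCreateWithFlags
      is_inner_stream_ = true;
      last_execute_async_ = false;
    }
    return stream_;
  }

 private:
  int stream_ = 0;
  bool is_inner_stream_ = false;   // mirrors is_inner_model_stream_
  bool last_execute_async_ = false;
  int next_id_ = 100;
};

int main() {
  ModelStreamSketch m;
  std::cout << m.InitModelStream(7, true) << "\n";   // async: uses stream 7
  std::cout << m.InitModelStream(0, false) << "\n";  // mode flipped: creates 100
  std::cout << m.InitModelStream(0, false) << "\n";  // reuses 100
  return 0;
}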


+1 -0  ge/graph/load/new_model_manager/davinci_model.h

@@ -884,6 +884,7 @@ class DavinciModel {
bool is_inner_model_stream_;


bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_.
bool last_execute_mode_;


bool is_stream_list_bind_{false};
bool is_pure_head_stream_{false};


+82 -3  ge/graph/load/new_model_manager/model_manager.cc

@@ -43,6 +43,13 @@ const std::string kCmdTypeProfInit = "prof_init";
const std::string kCmdTypeProfFinalize = "prof_finalize";
const std::string kCmdTypeProfStart = "prof_start";
const std::string kCmdTypeProfStop = "prof_stop";
const char *const kLoadOpFromBuf = "loadOpFromBuf";
struct CustAicpuSoBuf {
uint64_t kernelSoBuf;
uint32_t kernelSoBufLen;
uint64_t kernelSoName;
uint32_t kernelSoNameLen;
} __attribute__((packed));
} // namespace


DumpProperties ModelManager::dump_properties_;
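CustAicpuSoBuf, moved here from kernel_task_info.cc, is the argument block handed to the device-side loadOpFromBuf kernel: __attribute__((packed)) pins the struct to 24 bytes so host and device agree on the field offsets, and device pointers travel as uint64_t. A sketch of the marshalling step, with the rtMalloc/rtMemcpy calls replaced by a plain buffer (an assumption to keep the example self-contained):

#include <cstdint>
#include <cstring>

struct CustAicpuSoBufSketch {
  uint64_t kernelSoBuf;
  uint32_t kernelSoBufLen;
  uint64_t kernelSoName;
  uint32_t kernelSoNameLen;
} __attribute__((packed));

static_assert(sizeof(CustAicpuSoBufSketch) == 24, "packed layout leaves no padding");

int main() {
  const char so_name[] = "libcust_aicpu.so";  // hypothetical .so name
  CustAicpuSoBufSketch args{};
  args.kernelSoName = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(so_name));
  args.kernelSoNameLen = sizeof(so_name) - 1;
  unsigned char device_args[sizeof(args)];        // stands in for rtMalloc'd memory
  std::memcpy(device_args, &args, sizeof(args));  // stands in for rtMemcpy
  return 0;
}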
@@ -163,7 +170,13 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
GELOGI("The session: %lu not created.", session_id); GELOGI("The session: %lu not created.", session_id);
return; return;
} else { } else {
GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId())));
rtContext_t ctx = nullptr;
bool has_ctx = (rtCtxGetCurrent(&ctx) == RT_ERROR_NONE);
if (!has_ctx) {
GELOGI("Set device %u.", GetContext().DeviceId());
GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId())));
}

Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0);
if (ret != SUCCESS) {
GELOGW("The session: %lu destroy failed.", session_id);
@@ -171,7 +184,11 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
(void)sess_ids_.erase(session_id);
GELOGI("The session: %lu destroyed.", session_id);
}
GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));

if (!has_ctx) {
GELOGI("Reset device %u.", GetContext().DeviceId());
GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
}
}
}
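DestroyAicpuSession now calls rtSetDevice/rtDeviceReset only when rtCtxGetCurrent reports no current context, so destroying a session from a thread that already owns a runtime context no longer resets that context underneath it. The shape of the change is a conditional acquire/release pair (a sketch; the global bool stands in for the runtime's context state):

#include <iostream>

bool g_has_ctx = false;  // stands in for rtCtxGetCurrent succeeding

void DestroySessionSketch(unsigned device_id) {
  bool acquired = false;
  if (!g_has_ctx) {        // only touch the device when nobody owns a context
    std::cout << "set device " << device_id << "\n";
    acquired = true;
  }
  std::cout << "launch FWK_ADPT_SESSION_DESTROY kernel\n";
  if (acquired) {          // release only what this call acquired
    std::cout << "reset device " << device_id << "\n";
  }
}

int main() {
  DestroySessionSketch(0);  // no current context: set + reset around the kernel
  g_has_ctx = true;
  DestroySessionSketch(0);  // context already present: kernel only
  return 0;
}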


@@ -219,6 +236,7 @@ ModelManager::~ModelManager() {
std::lock_guard<std::mutex> lock(map_mutex_);
model_map_.clear();
model_aicpu_kernel_.clear();
cust_aicpu_so_.clear();


GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0)));
}
@@ -919,7 +937,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
}
davinci_model->SetDeviceId(device_id);
davinci_model->SetOmName(model.om_name);
if (DumpManager::GetInstance().IsDumpOpen()) {
if (DumpManager::GetInstance().GetDumpProperties().IsDumpOpen()) {
davinci_model->SetDumpProperties(DumpManager::GetInstance().GetDumpProperties());
} else {
davinci_model->SetDumpProperties(dump_properties_);
@@ -1070,6 +1088,67 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) {
return SUCCESS;
}


Status ModelManager::LoadCustAicpuSo(const OpDescPtr op_desc, string so_name) {
std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
auto it = cust_aicpu_so_.find(so_name);
if (it == cust_aicpu_so_.end()) {
GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, so_name), "LaunchCustAicpuSo failed. op name %s, so_name %s",
op_desc->GetName().c_str(), so_name.c_str());
(void)cust_aicpu_so_.insert(so_name);
GELOGI("LaunchCustAicpuSo op name %s, so_name %s.", op_desc->GetName().c_str(), so_name.c_str());
}
return SUCCESS;
}

Status ModelManager::LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name) {
CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr());
if (aicpu_kernel == nullptr) {
GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str());
return INTERNAL_ERROR;
}
const void *aicpu_data = aicpu_kernel->GetBinData();
uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize();

void *d_aicpu_data = nullptr;
void *d_so_name = nullptr;
void *args = nullptr;
rtError_t status;
rtStream_t stream = nullptr;
GE_CHK_RT(rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT(rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(d_so_name, so_name.size(), reinterpret_cast<const void *>(so_name.c_str()), so_name.size(),
RT_MEMCPY_HOST_TO_DEVICE));

CustAicpuSoBuf cust_aicpu_so_buf;
cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
cust_aicpu_so_buf.kernelSoNameLen = so_name.size();

uint32_t args_size = sizeof(CustAicpuSoBuf);
GE_CHK_RT(rtMalloc(&args, args_size, RT_MEMORY_HBM));
GE_CHK_RT(rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT(rtStreamCreate(&stream, 0));
GE_CHK_RT(rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream));

status = rtStreamSynchronize(stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status);
GE_CHK_RT(rtStreamDestroy(stream));
GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));
return RT_ERROR_TO_GE_STATUS(status);
}
GE_CHK_RT(rtStreamDestroy(stream));
GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));
GELOGI("Cpu kernel launch loadOpFromBuf task success.");
return SUCCESS;
}

///
/// @ingroup ge
/// @brief get model memory size and weight
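LoadCustAicpuSo keeps the names of already-launched custom AICPU libraries in a mutex-guarded std::set, so each .so is shipped to the device once per ModelManager rather than once per kernel task, which is what the per-task LaunchCustAicpuSo in kernel_task_info.cc used to do. A minimal sketch of the once-per-key pattern:

#include <iostream>
#include <mutex>
#include <set>
#include <string>

class SoLoaderSketch {
 public:
  // Launches the load only the first time a given so_name is seen.
  bool LoadOnce(const std::string &so_name) {
    std::lock_guard<std::mutex> lock(mu_);
    if (loaded_.count(so_name) != 0) return false;  // already on the device
    std::cout << "launch loadOpFromBuf for " << so_name << "\n";
    loaded_.insert(so_name);
    return true;
  }

 private:
  std::mutex mu_;                  // mirrors cust_aicpu_mutex_
  std::set<std::string> loaded_;   // mirrors cust_aicpu_so_
};

int main() {
  SoLoaderSketch loader;
  loader.LoadOnce("libcust_aicpu.so");  // launches
  loader.LoadOnce("libcust_aicpu.so");  // cached, no second launch
  return 0;
}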


+6 -0  ge/graph/load/new_model_manager/model_manager.h

@@ -268,6 +268,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {


ge::Status DestroyAicpuSessionForInfer(uint32_t model_id);


ge::Status LoadCustAicpuSo(const OpDescPtr op_desc, string so_name);

ge::Status LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name);

ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info);


ge::Status GenSessionId(uint64_t &session_id);
@@ -333,6 +337,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
uint64_t session_id_bias_;
std::set<uint64_t> sess_ids_;
std::vector<rtExceptionInfo> exception_infos_;
std::mutex cust_aicpu_mutex_;
std::set<std::string> cust_aicpu_so_;


static DumpProperties dump_properties_;
};


+29 -12  ge/graph/load/new_model_manager/model_utils.cc

@@ -29,6 +29,14 @@
#include "framework/common/debug/ge_log.h" #include "framework/common/debug/ge_log.h"
#include "graph/manager/graph_var_manager.h" #include "graph/manager/graph_var_manager.h"


#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \
do { \
if (SIZE <= static_cast<uint64_t>(OFFSET)) { \
GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \
return {}; \
} \
} while (0)

namespace ge {
///
/// @ingroup ge
@@ -38,7 +46,7 @@ namespace ge {
vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) {
vector<int64_t> v_input_size;
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size);
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const string op_type = op_desc->GetType();


const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
@@ -151,7 +159,7 @@ vector<int64_t> ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) {
}


// other ops get weight from connected constop
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
for (size_t i = 0; i < inputs_size; ++i) {
if ((i < v_is_input_const.size()) && v_is_input_const[i]) {
@@ -191,7 +199,7 @@ vector<ConstGeTensorPtr> ModelUtils::GetWeights(ConstOpDescPtr op_desc) {
}


// other ops get weight from connected constop
const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();
for (size_t i = 0; i < inputs_size; ++i) {
if ((i < v_is_input_const.size()) && v_is_input_const[i]) {
@@ -221,7 +229,7 @@ vector<::tagCcAICPUTensor> ModelUtils::GetInputDescs(ConstOpDescPtr op_desc) {
vector<::opTensor_t> v_input_descs;
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_descs);


const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<bool> v_is_input_const = op_desc->GetIsInputConst();


for (size_t i = 0; i < inputs_size; ++i) {
@@ -306,7 +314,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr);
uint64_t session_id = model_param.session_id;


const size_t inputs_size = op_desc->GetInputsSize();
const size_t inputs_size = op_desc->GetAllInputsSize();
const vector<int64_t> v_input_offset = op_desc->GetInputOffset();


const string op_type = op_desc->GetType();
@@ -334,6 +342,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
if (tensor_size) {
int64_t data_offset = 0;
GE_CHK_STATUS(TensorUtils::GetDataOffset(*tensor_desc, data_offset));
VALIDATE_MEM_RANGE(op_desc, model_param.weight_size, data_offset);
uint8_t *weight_addr = model_param.weight_base + data_offset;
v_input_data_addr.push_back(weight_addr);
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id,
@@ -345,11 +354,12 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co


GE_IF_BOOL_EXEC(non_const_index >= v_input_offset.size(),
GELOGW("offsets=%zu, inputs=%zu, index=%zu.", v_input_offset.size(), inputs_size, non_const_index);
break;);
break);


int64_t input_offset = v_input_offset[non_const_index];
non_const_index++;
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset),
VALIDATE_MEM_RANGE(op_desc, model_param.var_size, input_offset - model_param.logic_var_base);
uint8_t *variable_addr = model_param.var_base + input_offset - model_param.logic_var_base;
v_input_data_addr.push_back(variable_addr);
GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]",
@@ -363,6 +373,7 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(input_offset));
v_input_data_addr.push_back(mem_addr);
} else {
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, input_offset);
mem_addr = model_param.mem_base + input_offset;
v_input_data_addr.push_back(mem_addr);
}
@@ -398,6 +409,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C
}
for (size_t i = 0; i < outputs_size; ++i) {
GE_IF_BOOL_EXEC(model_param.var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]),
VALIDATE_MEM_RANGE(op_desc, model_param.var_size, v_output_offset[i] - model_param.logic_var_base);
uint8_t *variable_addr = model_param.var_base + v_output_offset[i] - model_param.logic_var_base;
v_output_data_addr.push_back(variable_addr);
GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]",
@@ -410,6 +422,7 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C
mem_addr = reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_output_offset[i]));
v_output_data_addr.push_back(mem_addr);
} else {
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_output_offset[i]);
mem_addr = static_cast<uint8_t *>(model_param.mem_base + v_output_offset[i]);
v_output_data_addr.push_back(mem_addr);
}
@@ -440,15 +453,19 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param
for (size_t i = 0; i < v_workspace_bytes.size(); ++i) {
if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) {
v_workspace_data_addr.push_back(reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i])));
GELOGI("Fusion: op: %s, GetWorkspaceDataAddrs mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i,
reinterpret_cast<uint8_t *>(reinterpret_cast<intptr_t>(v_workspace_offset[i])));
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[L1] name[%s], mem_addr[workspace index %zu]:0x%lx",
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i]);
} else if (v_workspace_bytes[i] == 0) {
v_workspace_data_addr.push_back(nullptr);
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] Null addr",
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i]);
} else {
int64_t workspace_offset = v_workspace_offset[i];
int64_t workspace_bytes = v_workspace_bytes[i];
uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : model_param.mem_base + workspace_offset;
VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_workspace_offset[i]);
uint8_t *mem_addr = model_param.mem_base + v_workspace_offset[i];
v_workspace_data_addr.push_back(mem_addr);
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]",
model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr);
model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i],
mem_addr);
}
}
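The new VALIDATE_MEM_RANGE macro rejects any offset that does not fall strictly inside the corresponding memory section before the base-plus-offset pointer is formed, so an out-of-range offset becomes a logged failure instead of a wild address. The same bounds check as a standalone function (a sketch; the macro's early "return {}" behaviour is replaced by returning nullptr):

#include <cstdint>
#include <cstdio>

// Returns nullptr instead of an out-of-range pointer, mirroring the
// "SIZE <= OFFSET" rejection in VALIDATE_MEM_RANGE.
uint8_t *AddrInRange(uint8_t *base, uint64_t size, int64_t offset) {
  if (offset < 0 || size <= static_cast<uint64_t>(offset)) {
    std::fprintf(stderr, "memory out of range[%llu: %lld]\n",
                 static_cast<unsigned long long>(size),
                 static_cast<long long>(offset));
    return nullptr;
  }
  return base + offset;
}

int main() {
  static uint8_t mem[64];
  uint8_t *ok = AddrInRange(mem, sizeof(mem), 16);   // valid
  uint8_t *bad = AddrInRange(mem, sizeof(mem), 64);  // rejected: offset == size
  return (ok != nullptr && bad == nullptr) ? 0 : 1;
}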




+2 -94  ge/graph/load/new_model_manager/task_info/kernel_task_info.cc

@@ -26,6 +26,7 @@
#include "framework/common/l2_cache_optimize.h" #include "framework/common/l2_cache_optimize.h"
#include "graph/debug/ge_attr_define.h" #include "graph/debug/ge_attr_define.h"
#include "graph/load/new_model_manager/davinci_model.h" #include "graph/load/new_model_manager/davinci_model.h"
#include "graph/load/new_model_manager/model_manager.h"
#include "graph/load/new_model_manager/model_utils.h" #include "graph/load/new_model_manager/model_utils.h"
#include "runtime/kernel.h" #include "runtime/kernel.h"
#include "super_kernel/super_kernel.h" #include "super_kernel/super_kernel.h"
@@ -41,13 +42,6 @@ const char *kIsLastNode = "is_last_node";
const char *kIsFirstNode = "is_first_node";
const int64_t kCloseSkt = 100;
const uint32_t kAddrLen = sizeof(void *);
const char *const kLoadOpFromBuf = "loadOpFromBuf";
struct CustAicpuSoBuf {
uint64_t kernelSoBuf;
uint32_t kernelSoBufLen;
uint64_t kernelSoName;
uint32_t kernelSoNameLen;
} __attribute__((packed));
} // namespace


namespace ge {
@@ -861,92 +855,6 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) {
return SUCCESS;
}


Status KernelTaskInfo::LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def) {
CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr());
if (aicpu_kernel == nullptr) {
GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str());
return INTERNAL_ERROR;
}
const void *aicpu_data = aicpu_kernel->GetBinData();
uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize();

void *d_aicpu_data = nullptr;
rtError_t status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

void *d_so_name = nullptr;
status = rtMalloc(&d_so_name, so_name_.size(), RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtMemcpy(d_so_name, so_name_.size(), reinterpret_cast<const void *>(so_name_.c_str()), so_name_.size(),
RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

CustAicpuSoBuf cust_aicpu_so_buf;
cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
cust_aicpu_so_buf.kernelSoName = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
cust_aicpu_so_buf.kernelSoNameLen = so_name_.size();

void *args = nullptr;
uint32_t args_size = sizeof(CustAicpuSoBuf);
status = rtMalloc(&args, args_size, RT_MEMORY_HBM);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}
GELOGI("loadOpFromBuf kernelSoBuf %p, kernelSoBufLen %u, kernelSoName %p, kernelSoNameLen %u.", d_aicpu_data,
aicpu_data_length, d_so_name, so_name_.size());

status = rtMemcpy(args, args_size, static_cast<void *>(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

rtStream_t stream = nullptr;
status = rtStreamCreate(&stream, 0);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt create stream failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

status = rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt CpuKernelLaunch loadOpFromBuf failed, status: 0x%X", status);
return RT_ERROR_TO_GE_STATUS(status);
}
GELOGI("Cpu kernel launch loadOpFromBuf.");

status = rtStreamSynchronize(stream);
if (status != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status);
return RT_ERROR_TO_GE_STATUS(status);
}

GE_CHK_RT(rtFree(args));
GE_CHK_RT(rtFree(d_aicpu_data));
GE_CHK_RT(rtFree(d_so_name));

GELOGI("Cpu kernel launch loadOpFromBuf task success.");
return SUCCESS;
}

Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &kernel_def) {
GELOGI("Do InitAicpuTask");
so_name_ = kernel_def.so_name();
@@ -961,7 +869,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
}


if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, kernel_def), "launch cust aicpu so failed");
GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_), "launch cust aicpu so failed");
}


// copy args to new host memory


+0 -2  ge/graph/load/new_model_manager/task_info/kernel_task_info.h

@@ -106,8 +106,6 @@ class KernelTaskInfo : public TaskInfo {


Status InitAicpuTaskExtInfo(const std::string &ext_info);


Status LaunchCustAicpuSo(const OpDescPtr op_desc, const domi::KernelDef &kernel_def);

Status StoreInputOutputTensor(const std::vector<void *> &input_data_addrs,
const std::vector<void *> &output_data_addrs,
const std::vector<::tagCcAICPUTensor> &input_descs,


+4 -6  ge/graph/load/new_model_manager/zero_copy_task.cc

@@ -118,13 +118,11 @@ bool ZeroCopyTask::CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_ad
*/
Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs,
const string &batch_label) {
for (auto pair : task_addr_offset_) {
if (pair.first != addr) {
continue;
}

auto iter = task_addr_offset_.find(addr);
if (iter != task_addr_offset_.end()) {
auto &cur_pair = *iter;
uint8_t *args_info = args_info_.data();
for (auto offset : pair.second) {
for (auto offset : cur_pair.second) {
if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast<uintptr_t>(args_addr_ + offset))) {
continue;
}


+5 -1  ge/graph/load/new_model_manager/zero_copy_task.h

@@ -83,6 +83,10 @@ class ZeroCopyTask {
*/
ge::Status DistributeParam(bool async_mode, rtStream_t stream);


void SetBatchLabel(const string &batch_label) { batch_label_ = batch_label; }

const string &GetBatchLabel() const { return batch_label_; }

protected:
bool CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label, uintptr_t addr);


@@ -93,7 +97,7 @@ class ZeroCopyTask {
const size_t args_size_;
vector<uint8_t> args_info_;
bool is_updated_;
string batch_label_;
// <address from Op, {offset in args}>
map<uintptr_t, vector<size_t>> task_addr_offset_;
};


+9 -1  ge/graph/manager/graph_manager.cc

@@ -267,6 +267,14 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph,
auto compute_graph = GraphUtils::GetComputeGraph(graph);
if (compute_graph != nullptr) {
compute_graph->SetGraphID(graph_id);
bool graph_has_been_added = false;
if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) &&
graph_has_been_added) {
GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] same graph object can not be added again, graph_id = %u.",
graph_id);
return GE_GRAPH_GRAPH_ALREADY_EXIST;
}
(void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true);
} else {
GELOGE(FAILED, "compute graph is null");
return FAILED;
@@ -1953,9 +1961,9 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
names_to_passes.emplace_back("MergePass", &merge_pass); names_to_passes.emplace_back("MergePass", &merge_pass);
names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass); names_to_passes.emplace_back("CastRemovePass", &cast_remove_pass);
names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); names_to_passes.emplace_back("TransposeTransDataPass", &transpose_transdata_pass);
names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass); names_to_passes.emplace_back("TransOpSymmetryEliminationPass", &symmetry_elimination_pass);
names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass); names_to_passes.emplace_back("TransOpNearbyAllreduceFusionPass", &trans_op_nearby_allreduce_fusion_pass);
names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass);
names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass);
names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass); names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass);


+3 -0  ge/graph/partition/engine_place.cc

@@ -23,6 +23,7 @@
#include <mutex>


#include "common/op/ge_op_utils.h"
#include "common/util/error_manager/error_manager.h"
#include "graph/utils/graph_utils.h"
#include "graph/utils/op_desc_utils.h"
#include "init/gelib.h"
@@ -82,6 +83,8 @@ Status EnginePlacer::Run() {
// If can't get op's engine name, keep check support finish and return failed
if (engine_name.empty()) {
is_check_support_success = false;
ErrorManager::GetInstance().ATCReportErrMessage("E13003", {"opname", "optype"},
{op_desc->GetName(), op_desc->GetType()});
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Can not find engine of op type %s",
node_ptr->GetOpDesc()->GetType().c_str());
continue;


+4 -0  ge/graph/passes/for_pass.cc

@@ -190,6 +190,10 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vector<OutDataAnc
GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index); GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index);
return FAILED; return FAILED;
} }
GE_IF_BOOL_EXEC(
in_data_anchor->GetPeerOutAnchor() == nullptr,
GELOGW("Get null input by index %d from node %s ", in_data_anchor->GetIdx(), node->GetName().c_str());
continue);
data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor()); data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor());
} }




+1 -1  ge/graph/passes/multi_batch_clone_pass.cc

@@ -239,7 +239,7 @@ Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, N


GeTensorDesc const_tensor(GeShape({count}), FORMAT_ND, DT_INT32);
GeTensor tensor(const_tensor);
tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t));
(void)tensor.SetData(reinterpret_cast<uint8_t *>(addr.get()), count * sizeof(int32_t));
if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) {
GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str());
return FAILED;


+3 -0  ge/graph/passes/reshape_recovery_pass.cc

@@ -50,9 +50,12 @@ Status InsertReshapeIfNeed(const NodePtr &node) {
GE_CHECK_NOTNULL(src_tensor);
for (auto dst_anchor : src_anchor->GetPeerInDataAnchors()) {
auto dst_node = dst_anchor->GetOwnerNode();
GELOGD("Try insert reshape between %s[%d] and %s[%d] to keep the shape continues", node->GetName().c_str(),
src_anchor->GetIdx(), dst_node->GetName().c_str(), dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_node);
GE_CHECK_NOTNULL(dst_node->GetOpDesc());
auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_tensor);
bool is_need_insert_reshape = src_tensor->GetShape().GetDims() != UNKNOWN_RANK &&
dst_tensor->GetShape().GetDims() != UNKNOWN_RANK &&
src_tensor->GetShape().GetDims() != dst_tensor->GetShape().GetDims();


+11 -7  ge/graph/preprocess/multi_batch_copy_graph.cc

@@ -113,10 +113,9 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) {
desc->CopyAttrsFrom(*src_op_desc);
for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) {
auto input_desc = desc->MutableInputDesc(i);
GE_IF_BOOL_EXEC(input_desc == nullptr,
GELOGE(INTERNAL_ERROR, "Failed to get input desc by index %u from node %s when copy from %s", i,
desc->GetName().c_str(), node->GetName().c_str());
return nullptr);
GE_IF_BOOL_EXEC(input_desc == nullptr, GELOGW("Get null input desc by index %u from node %s when copy from %s", i,
desc->GetName().c_str(), node->GetName().c_str());
continue);


input_desc->CopyAttrsFrom(src_op_desc->GetInputDesc(i));
}
@@ -991,12 +990,17 @@ Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() {
size_t i = 0;
for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY);
GE_CHECK_NOTNULL(identity_desc);

auto out_node = in_data_anchor->GetOwnerNode();
auto op_desc = out_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
if ((out_node->GetType() == MERGE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
GELOGD("No need to insert identity between %s and %s.", node->GetName().c_str(), out_node->GetName().c_str());
continue;
}

auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY);
GE_CHECK_NOTNULL(identity_desc);

string batch_label;
if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {


+238 -194  ge/host_kernels/strided_slice_kernel.cc

@@ -16,131 +16,262 @@


#include "host_kernels/strided_slice_kernel.h" #include "host_kernels/strided_slice_kernel.h"


#include <memory>

#include "common/fp16_t.h" #include "common/fp16_t.h"
#include "common/ge_inner_error_codes.h" #include "common/ge_inner_error_codes.h"
#include "common/math/math_util.h" #include "common/math/math_util.h"
#include "common/op/ge_op_utils.h" #include "common/op/ge_op_utils.h"
#include "external/graph/types.h"
#include "framework/common/debug/ge_log.h" #include "framework/common/debug/ge_log.h"
#include "host_kernels/kernel_utils.h"
#include "graph/utils/type_utils.h" #include "graph/utils/type_utils.h"
#include "host_kernels/kernel_utils.h"
#include "inc/kernel_factory.h" #include "inc/kernel_factory.h"
#include <memory>


namespace ge {
namespace {
const int32_t kNumOne = 1;
const size_t kStridedSliceInputSize = 4;
const size_t kStridedSliceInputIndex0 = 0;
const size_t kStridedSliceInputIndex1 = 1;
const size_t kStridedSliceInputIndex2 = 2;
const size_t kStridedSliceInputIndex3 = 3;
const int32_t kDefaultSrideSize = 1;
} // namespace
Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input,
Attr &args) {
int64_t begin_mask = 0;
int64_t end_mask = 0;
int64_t ellipsis_mask = 0;
int64_t new_axis_mask = 0;
int64_t shrink_axis_mask = 0;
const size_t kStridedSliceInputIndex = 0;
const size_t kStridedSliceBeginIndex = 1;
const size_t kStridedSliceEndIndex = 2;
const size_t kStridedSliceStrideIndex = 3;
const int32_t kDefaultStrideSize = 1;
const std::set<DataType> kIndexNumberType = {DT_INT32, DT_INT64};


if (attr == nullptr) {
GELOGW("input opdescptr is nullptr.");
return PARAM_INVALID;
bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const int ellipsis_mask) {
if (ellipsis_mask != 0) {
auto ellipsis_num = 0;
auto input_shape = input_desc->GetShape();
bool ellipsis_mask_flag = false;
for (size_t i = 0; i < input_shape.GetDimNum(); i++) {
uint32_t i_temp = static_cast<uint32_t>(i);
ellipsis_mask_flag = (static_cast<uint32_t>(ellipsis_mask) & (1 << i_temp));
if (ellipsis_mask_flag) {
++ellipsis_num;
}
if (ellipsis_num > 1) {
GELOGW("Only one non-zero bit is allowed in ellipsis_mask.");
return false;
}
}
}
if (input.size() != kStridedSliceInputSize) {
GELOGW("The number of input for strided slice must be %zu.", kStridedSliceInputSize);
return PARAM_INVALID;
return true;
}
} // namespace
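The IsEllipsisMaskValid helper above walks the input dimensions and fails as soon as a second set bit of ellipsis_mask is found, enforcing the one-ellipsis rule. When the mask's meaningful bits all lie below the dimension count, the same rule reduces to a power-of-two test on the mask value (a sketch that checks all 32 bits, which is a slight simplification of the per-dimension loop):

#include <cstdint>

// True when ellipsis_mask has at most one bit set (zero or a power of two),
// which is what the per-dimension loop in IsEllipsisMaskValid enforces.
bool HasAtMostOneEllipsisBit(uint32_t ellipsis_mask) {
  return (ellipsis_mask & (ellipsis_mask - 1)) == 0;
}

int main() {
  return (HasAtMostOneEllipsisBit(0) && HasAtMostOneEllipsisBit(4) &&
          !HasAtMostOneEllipsisBit(5)) ? 0 : 1;
}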
Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input,
vector<ge::GeTensorPtr> &v_output) {
GELOGD("StridedSliceKernel in.");
// 1.Check input and attrs
if (CheckAndGetAttr(attr) != SUCCESS) {
GELOGW("Check and get attrs failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_BEGIN_MASK, begin_mask)) {
GELOGW("get begin_mask attr failed.");
return PARAM_INVALID;
if (CheckInputParam(input) != SUCCESS) {
GELOGW("Check input params failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_END_MASK, end_mask)) {
GELOGW("get end_mask attr failed.");
return PARAM_INVALID;
// 2.Init param with mask attrs.
std::vector<int64_t> input_dims;
std::vector<int64_t> begin_vec;
std::vector<int64_t> output_dims;
std::vector<int64_t> stride_vec;
if (InitParamWithAttrs(input, input_dims, begin_vec, output_dims, stride_vec) != SUCCESS) {
GELOGW("Init param with mask attrs failed.Ignore kernel.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_ELLIPSIS_MASK, ellipsis_mask)) {
GELOGW("get ellipsis_mask attr failed.");
return PARAM_INVALID;

// 3.Set sliced data to output_ptr
ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
auto data_type = weight0->GetTensorDesc().GetDataType();
size_t data_size = weight0->GetData().size() / GetSizeByDataType(data_type);
void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data()));
GE_CHECK_NOTNULL(data);
// Index 0 can always gets a GeTensorDesc object from any OpDescPtr.
auto output_tensor_desc = attr->GetOutputDesc(0);
GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc);
if (output_ptr == nullptr) {
GELOGE(MEMALLOC_FAILED, "MakeShared GeTensor failed, node name %s.", attr->GetName().c_str());
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_NEW_AXIS_MASK, new_axis_mask)) {
GELOGW("get new_axis_mask attr failed.");
return PARAM_INVALID;
auto ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), data_type, input_dims, begin_vec,
output_dims, output_ptr.get(), stride_vec);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "SetOutputSliceData failed.");
return NOT_CHANGED;
}
if (!AttrUtils::GetInt(attr, STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, shrink_axis_mask)) {
GELOGW("get shrink_axis_mask attr failed.");

// 4.Set output data_type and shape
GeTensorDesc &t_d = output_ptr->MutableTensorDesc();
t_d.SetDataType(static_cast<DataType>(data_type));

auto final_dim_size = static_cast<uint32_t>(output_dims.size());
vector<int64_t> v_dims;
GetOutputDims(final_dim_size, output_dims, v_dims);
t_d.SetShape(GeShape(v_dims));
v_output.push_back(output_ptr);
GELOGI("StridedSliceKernel success.");
return SUCCESS;
}
Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr) {
if (attr == nullptr) {
GELOGE(PARAM_INVALID, "input opdescptr is nullptr.");
return PARAM_INVALID;
}
if ((ellipsis_mask != 0) || (new_axis_mask != 0)) {
GELOGW("ellipsis_mask or new_axis_mask must be 0 with optimizer.");
return NOT_CHANGED;
// Get all op attr value of strided_slice
for (auto &attr_2_value : attr_value_map_) {
if (!AttrUtils::GetInt(attr, attr_2_value.first, attr_2_value.second)) {
GELOGE(PARAM_INVALID, "Get %s attr failed.", attr_2_value.first.c_str());
return PARAM_INVALID;
}
} }
const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex0);
// Check ellipsis_mask is valid
const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex);
GE_CHECK_NOTNULL(input_desc);
DataType data_type = input_desc->GetDataType();
if ((data_type != DT_FLOAT) && (data_type != DT_INT32)) {
GELOGW(
"Data type of StridedSlice OP must be float or int32."
"Constant folding will not be carried out in this condition"
"which might affect the time performance but not the accuracy");
}
args.begin_mask = begin_mask;
args.end_mask = end_mask;
args.ellipsis_mask = ellipsis_mask;
args.new_axis_mask = new_axis_mask;
args.data_type = static_cast<int64_t>(data_type);
args.shrink_axis_mask = shrink_axis_mask;

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0];
ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1];
ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2];
ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3];
if (CheckWeight(weight0, weight1, weight2, weight3) != SUCCESS) {
GELOGW("Check And Get Attr failed.");
auto ellipsis_mask = attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK);
if (!IsEllipsisMaskValid(input_desc, ellipsis_mask)) {
return PARAM_INVALID;
}

return SUCCESS;
}
Status StridedSliceKernel::CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1,
const ConstGeTensorPtr &weight2, const ConstGeTensorPtr &weight3) const {
if ((weight0 == nullptr) || (weight1 == nullptr) || (weight2 == nullptr) || (weight3 == nullptr)) {
GELOGW("weight is nullptr.");
Status StridedSliceKernel::CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const {
if (input.size() != kStridedSliceInputSize) {
GELOGE(PARAM_INVALID, "The number of input for strided slice must be %zu.", kStridedSliceInputSize);
return PARAM_INVALID;
}
if (!(weight1->GetTensorDesc().GetDataType() == DT_INT32 && weight2->GetTensorDesc().GetDataType() == DT_INT32 &&
weight3->GetTensorDesc().GetDataType() == DT_INT32)) {
GELOGE(INTERNAL_ERROR, "Data type of StridedSlice OP(begin,end,strides) must be int32.");
return INTERNAL_ERROR;

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex];
ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex];
ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex];
GE_CHECK_NOTNULL(weight0);
GE_CHECK_NOTNULL(begin_tensor);
GE_CHECK_NOTNULL(end_tensor);
GE_CHECK_NOTNULL(stride_tensor);

// check if begin,end,strides data type is supported
auto begin_tensor_desc = begin_tensor->GetTensorDesc();
auto end_tensor_desc = begin_tensor->GetTensorDesc();
auto stride_tensor_desc = begin_tensor->GetTensorDesc();
if (begin_tensor_desc.GetDataType() != end_tensor_desc.GetDataType() ||
end_tensor_desc.GetDataType() != stride_tensor_desc.GetDataType()) {
GELOGW("Data type of StridedSlice OP(begin,end,strides) must be same.");
return PARAM_INVALID;
}
if (kIndexNumberType.find(begin_tensor_desc.GetDataType()) == kIndexNumberType.end()) {
GELOGW("Data type of StridedSlice OP(begin,end,strides) must be int32 or int64.");
return PARAM_INVALID;
} }


// check data // check data
size_t weight0_size = weight0->GetData().size() / sizeof(int32_t);
size_t weight1_size = weight1->GetData().size() / sizeof(int32_t);
size_t weight2_size = weight2->GetData().size() / sizeof(int32_t);
size_t weight3_size = weight3->GetData().size() / sizeof(int32_t);
if ((weight0_size == 0) || (weight1_size == 0) || (weight2_size == 0) || (weight3_size == 0)) {
auto x_data_type = weight0->GetTensorDesc().GetDataType();
auto x_data_size = GetSizeByDataType(x_data_type);
if (x_data_size < 0) {
GELOGW("Data type of x input %s is not supported.", TypeUtils::DataTypeToSerialString(x_data_type).c_str());
return PARAM_INVALID;
}
size_t weight0_size = weight0->GetData().size() / x_data_size;
size_t begin_data_size = begin_tensor->GetData().size() / sizeof(int32_t);
size_t end_data_size = end_tensor->GetData().size() / sizeof(int32_t);
size_t stride_data_size = stride_tensor->GetData().size() / sizeof(int32_t);
if ((weight0_size == 0) || (begin_data_size == 0) || (end_data_size == 0) || (stride_data_size == 0)) {
GELOGW("Data size of inputs is 0."); GELOGW("Data size of inputs is 0.");
return PARAM_INVALID; return PARAM_INVALID;
} }

// check dim size // check dim size
size_t weight0_dim_size = weight0->GetTensorDesc().GetShape().GetDimNum();
if (!((weight0_dim_size >= weight1_size) && (weight1_size == weight2_size) && (weight1_size == weight3_size))) {
if (!((begin_data_size == end_data_size) && (end_data_size == stride_data_size))) {
GELOGW("The sizes of begin, end and stride is not supported."); GELOGW("The sizes of begin, end and stride is not supported.");
return NOT_CHANGED;
return PARAM_INVALID;
} }


return SUCCESS; return SUCCESS;
} }


Status StridedSliceKernel::InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input,
std::vector<int64_t> &input_dims, std::vector<int64_t> &begin_vec,
std::vector<int64_t> &output_dims, std::vector<int64_t> &stride_vec) {
ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex];
ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex];
ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex];
ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex];

const GeShape x_shape = weight0->GetTensorDesc().GetShape();
auto x_dims = x_shape.GetDims();
auto x_dims_num = x_shape.GetDimNum();
// handle new_axis_mask
ExpandDimsWithNewAxis(begin_tensor, x_dims_num, x_dims);

const int32_t *begin = reinterpret_cast<const int32_t *>(begin_tensor->GetData().data());
const int32_t *end = reinterpret_cast<const int32_t *>(end_tensor->GetData().data());
const int32_t *stride = reinterpret_cast<const int32_t *>(stride_tensor->GetData().data());
auto begin_dim_num = begin_tensor->GetData().size() / sizeof(int32_t);
auto min_dim = x_dims_num > begin_dim_num ? begin_dim_num : x_dims_num;
for (size_t i = 0; i < x_dims.size(); ++i) {
auto i_temp = static_cast<uint64_t>(i);
bool new_axis_mask_flag =
(static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp));
if (new_axis_mask_flag) {
output_dims.push_back(1);
input_dims.push_back(1);
begin_vec.push_back(0);
stride_vec.push_back(1);
continue;
}

int64_t begin_i = 0;
int64_t end_i = 0;
int64_t stride_i = 1;
if (i < min_dim) {
begin_i = begin[i];
end_i = end[i];
stride_i = stride[i];
} else {
begin_i = 0;
end_i = x_dims.at(i);
stride_i = 1;
}
GELOGD("Before mask calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i,
stride_i, x_dims.at(i));
auto ret = MaskCal(i, begin_i, end_i, x_dims.at(i));
if (ret != SUCCESS) {
GELOGW("MaskCal failed, because of data overflow.");
return NOT_CHANGED;
}
int64_t dim_final;
GELOGD("Before stride calculate. Begin is : %d\t,end is : %d\t stride is : %d\t x_dim_i is : %d.", begin_i, end_i,
stride_i, x_dims.at(i));
(void)StrideCal(x_dims.at(i), begin_i, end_i, stride_i, dim_final);
output_dims.push_back(dim_final);
input_dims.push_back(x_dims.at(i));
begin_vec.push_back(begin_i);
stride_vec.push_back(stride_i);
}
return SUCCESS;
}
void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num,
vector<int64_t> &x_dims) {
auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType());
size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size;
auto final_dim_num = x_dims_num < begin_vec_size ? begin_vec_size : x_dims_num;
for (size_t i = 0; i < final_dim_num; i++) {
auto i_temp = static_cast<uint64_t>(i);
bool new_axis_mask_flag =
(static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp));
if (new_axis_mask_flag) {
x_dims.insert(x_dims.begin() + i, 1);
}
}
}
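// A standalone sketch of the new_axis_mask expansion above (illustrative
// names, not GE symbols): every set bit inserts a 1-sized axis at that
// position, so x of shape [4, 5] with mask 0b010 becomes [4, 1, 5].
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> ExpandWithNewAxis(std::vector<int64_t> dims, uint64_t new_axis_mask,
                                       size_t begin_vec_size) {
  size_t final_dim_num = std::max(dims.size(), begin_vec_size);
  for (size_t i = 0; i < final_dim_num; ++i) {
    if (new_axis_mask & (1ULL << i)) {
      dims.insert(dims.begin() + i, 1);  // a 1-sized axis appears at position i
    }
  }
  return dims;
}
// ExpandWithNewAxis({4, 5}, 0x2, 3) == {4, 1, 5}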
Status StridedSliceKernel::MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const {
  uint64_t i_temp = static_cast<uint64_t>(i);
  bool begin_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_BEGIN_MASK)) & (1 << i_temp));
  bool end_mask_flag = (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK)) & (1 << i_temp));
  bool ellipsis_mask_flag =
    (static_cast<uint64_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK)) & (1 << i_temp));
  bool shrink_mask_flag =
    (static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << i_temp));
  if (shrink_mask_flag) {
    begin_i = (begin_i < 0 ? (dim_i + begin_i) : begin_i);
    FMK_INT32_ADDCHECK(begin_i, kNumOne)
    end_i = begin_i + kNumOne;
  } else {
    if (begin_mask_flag) {
@@ -153,130 +284,43 @@ Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_
    } else {
      end_i = (end_i < 0 ? (dim_i + end_i) : end_i);
    }
    if (ellipsis_mask_flag) {
      begin_i = 0;
      end_i = dim_i;
    }
  }
  return SUCCESS;
}
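// Sanity check on the mask semantics above, as a self-contained sketch
// (hypothetical helper, not the GE code): shrink pins a single element,
// begin/end masks select the full range, and ellipsis overrides both.
#include <cstdint>

void ResolveAxis(bool begin_mask, bool end_mask, bool ellipsis_mask, bool shrink_mask,
                 int64_t dim, int64_t &begin, int64_t &end) {
  if (shrink_mask) {
    begin = begin < 0 ? dim + begin : begin;
    end = begin + 1;  // exactly one element; the axis is dropped later
    return;
  }
  begin = begin_mask ? 0 : (begin < 0 ? dim + begin : begin);
  end = end_mask ? dim : (end < 0 ? dim + end : end);
  if (ellipsis_mask) {
    begin = 0;
    end = dim;
  }
}
// With dim = 10, begin = -1 and shrink set, this resolves to begin = 9 and
// end = 10, i.e. the slice keeps only x[9] on that axis.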
Status StridedSliceKernel::StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i,
                                     int64_t &dim_final) const {
  if (stride_i == 0) {
    stride_i = kDefaultStrideSize;
  } else if (stride_i < 0) {
    stride_i = -stride_i;
    begin_i = x_dims_i - begin_i - 1;
    end_i = x_dims_i - end_i - 1;
  }

  if (end_i > x_dims_i) {
    end_i = x_dims_i;
  }

  if ((begin_i == 0) && (end_i == 0)) {
    dim_final = x_dims_i;
  } else {
    dim_final = abs(end_i - begin_i) / stride_i;
  }
  return SUCCESS;
}
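// Worked example of StrideCal: a negative stride flips the axis before the
// extent is computed. Sketch only (illustrative helper, not the GE symbol).
#include <cstdint>

int64_t SlicedDim(int64_t dim, int64_t begin, int64_t end, int64_t stride) {
  if (stride == 0) stride = 1;             // kDefaultStrideSize fallback
  if (stride < 0) {                        // flip the axis for negative strides
    stride = -stride;
    begin = dim - begin - 1;
    end = dim - end - 1;
  }
  if (end > dim) end = dim;
  if (begin == 0 && end == 0) return dim;  // empty begin/end selects everything
  int64_t diff = end > begin ? end - begin : begin - end;
  return diff / stride;
}
// SlicedDim(10, 8, 2, -2) == 3, matching x[8:2:-2] -> elements 8, 6, 4.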
void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims,
                                       vector<int64_t> &v_dims) {
  for (uint32_t k = 0; k < dims_size; k++) {
    bool shrink_mask_i = (static_cast<uint32_t>(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << k));
    if (shrink_mask_i) {
      continue;
    }
    v_dims.push_back(output_dims[k]);
  }
}
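// How shrink_axis_mask trims the final shape (mirrors GetOutputDims above;
// standalone sketch, not the GE symbol):
#include <cstdint>
#include <vector>

std::vector<int64_t> DropShrunkAxes(const std::vector<int64_t> &dims, uint32_t shrink_axis_mask) {
  std::vector<int64_t> out;
  for (uint32_t k = 0; k < dims.size(); ++k) {
    if (shrink_axis_mask & (1u << k)) {
      continue;  // axis k was pinned to one element and is dropped
    }
    out.push_back(dims[k]);
  }
  return out;
}
// DropShrunkAxes({1, 5, 7}, 0x1) == {5, 7}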

Status StridedSliceKernel::CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr) {
// check dim not all less than 0
for (auto dim : output_dims) {
if (dim > 0) {
return SUCCESS;
}
}
GELOGW("all output dim <=0, can't be processed. op_name : %s", attr->GetName().c_str());
return NOT_CHANGED;
}

Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector<ge::ConstGeTensorPtr> &input,
vector<ge::GeTensorPtr> &v_output) {
GELOGI("StridedSliceKernel in.");
Attr args;
Status ret = CheckAndGetAttr(attr, input, args);
if (ret != SUCCESS) {
GELOGW("Check And Get Attr failed.");
return NOT_CHANGED;
}

ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex0];
ConstGeTensorPtr weight1 = input[kStridedSliceInputIndex1];
ConstGeTensorPtr weight2 = input[kStridedSliceInputIndex2];
ConstGeTensorPtr weight3 = input[kStridedSliceInputIndex3];

const GeShape x_shape = weight0->GetTensorDesc().GetShape();
size_t dim_size = x_shape.GetDimNum();
size_t data_size = weight0->GetData().size() / sizeof(int32_t);

const int32_t *begin = reinterpret_cast<const int32_t *>(weight1->GetData().data());
const int32_t *end = reinterpret_cast<const int32_t *>(weight2->GetData().data());
const int32_t *stride = reinterpret_cast<const int32_t *>(weight3->GetData().data());
if ((begin == nullptr) || (end == nullptr) || (stride == nullptr)) {
GELOGW("input weight tensor is nullptr.");
return NOT_CHANGED;
}

std::vector<int64_t> input_dims;
std::vector<int64_t> begin_vec;
std::vector<int64_t> output_dims;
std::vector<int64_t> stride_vec;
int64_t dim_final;
for (size_t i = 0; i < dim_size; i++) {
int32_t begin_i = begin[i];
int32_t end_i = end[i];
int32_t stride_i = stride[i];
int32_t dim_i = static_cast<int32_t>(x_shape.GetDim(i));
GELOGI("%d\t %d\t %d\t %d", begin_i, end_i, stride_i, dim_i);
uint32_t i_temp = static_cast<uint32_t>(i);
bool begin_mask_i = (static_cast<uint32_t>(args.begin_mask) & (1 << i_temp));
bool end_mask_i = (static_cast<uint32_t>(args.end_mask) & (1 << i_temp));
bool shrink_mask_i = (static_cast<uint32_t>(args.shrink_axis_mask) & (1 << i_temp));
ret = MaskCal(begin_mask_i, end_mask_i, shrink_mask_i, begin_i, end_i, dim_i);
if (ret != SUCCESS) {
GELOGW("MaskCal failed, because of data overflow.");
return NOT_CHANGED;
}
if (stride_i == 0) {
stride_i = kDefaultSrideSize;
} else if (stride_i < 0) {
stride_i = -stride_i;
begin_i = x_shape.GetDim(i) - begin_i - 1;
end_i = x_shape.GetDim(i) - end_i - 1;
}
if ((begin_i == 0) && (end_i == 0)) {
dim_final = x_shape.GetDim(i);
} else {
dim_final = abs(end_i - begin_i) / stride_i;
}
output_dims.push_back(dim_final);
input_dims.push_back(x_shape.GetDim(i));
begin_vec.push_back(begin_i);
stride_vec.push_back(stride_i);
}

// Index 0 can always gets a GeTensorDesc object from any OpDescPtr.
auto output_tensor_desc = attr->GetOutputDesc(0);
GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc);
if (output_ptr == nullptr) {
GELOGW("MakeShared GeTensor failed, node name %s.", attr->GetName().c_str());
return NOT_CHANGED;
}

void *data = reinterpret_cast<void *>(const_cast<uint8_t *>(weight0->GetData().data()));
GE_CHECK_NOTNULL(data);

ret = CheckOutputDims(output_dims, attr);
if (ret != SUCCESS) {
return ret;
}

ret = OpUtils::SetOutputSliceData(data, static_cast<int64_t>(data_size), args.data_type, input_dims, begin_vec,
output_dims, output_ptr.get(), stride_vec);
if (ret != SUCCESS) {
GELOGW("SetOutputSliceData failed.");
return NOT_CHANGED;
}

GeTensorDesc &t_d = output_ptr->MutableTensorDesc();
t_d.SetDataType(static_cast<DataType>(args.data_type));

uint32_t final_dim_size = static_cast<uint32_t>(output_dims.size());
vector<int64_t> v_dims;
GetOutputDims(final_dim_size, output_dims, args, v_dims);
t_d.SetShape(GeShape(v_dims));
v_output.push_back(output_ptr);
GELOGI("StridedSliceKernel success.");
return SUCCESS;
}
REGISTER_KERNEL(STRIDEDSLICE, StridedSliceKernel);
}  // namespace ge

+18 -19  ge/host_kernels/strided_slice_kernel.h

@@ -17,34 +17,33 @@
#ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_
#define GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_

#include "inc/kernel.h"
#include <vector>

namespace ge {
struct Attr {
int64_t begin_mask;
int64_t end_mask;
int64_t ellipsis_mask;
int64_t new_axis_mask;
int64_t data_type;
int64_t shrink_axis_mask;
};

class StridedSliceKernel : public Kernel {
 public:
  Status Compute(const OpDescPtr attr, const std::vector<ConstGeTensorPtr> &input,
                 vector<GeTensorPtr> &v_output) override;

 private:
Status CheckAndGetAttr(const OpDescPtr &attr, const std::vector<ConstGeTensorPtr> &input, Attr &args);
Status CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, const ConstGeTensorPtr &weight2,
const ConstGeTensorPtr &weight3) const;
Status MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, int32_t &begin_i,
int32_t &end_i, int32_t &dim_i) const;
void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, const Attr &args,
vector<int64_t> &v_dims);
Status CheckOutputDims(const std::vector<int64_t> &output_dims, const OpDescPtr attr);
Status CheckAndGetAttr(const OpDescPtr &attr);
Status CheckInputParam(const std::vector<ConstGeTensorPtr> &input) const;
Status InitParamWithAttrs(const std::vector<ConstGeTensorPtr> &input, std::vector<int64_t> &input_dims,
std::vector<int64_t> &begin_vec, std::vector<int64_t> &output_dims,
std::vector<int64_t> &stride_vec);
Status MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const;
Status StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i,
int64_t &dim_final) const;
void ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num, vector<int64_t> &x_dims);

void GetOutputDims(uint32_t dims_size, const std::vector<int64_t> &output_dims, vector<int64_t> &v_dims);

map<string, uint32_t> attr_value_map_ = {{STRIDE_SLICE_ATTR_BEGIN_MASK, 0},
{STRIDE_SLICE_ATTR_END_MASK, 0},
{STRIDE_SLICE_ATTR_ELLIPSIS_MASK, 0},
{STRIDE_SLICE_ATTR_NEW_AXIS_MASK, 0},
{STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK, 0}};
};
}  // namespace ge
#endif  // GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_
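The attr_value_map_ above doubles as a table of defaults and a fetch list: every mask starts at 0 and is overwritten from the op's attributes, so one loop replaces five hand-written AttrUtils::GetInt calls. A minimal sketch of the same pattern (standalone; the names here are illustrative, not GE symbols):

#include <cstdint>
#include <map>
#include <string>

bool FetchMasks(const std::map<std::string, uint32_t> &op_attrs,
                std::map<std::string, uint32_t> &mask_values) {
  for (auto &kv : mask_values) {
    auto it = op_attrs.find(kv.first);
    if (it == op_attrs.end()) {
      return false;  // CheckAndGetAttr treats a missing mask attr as PARAM_INVALID
    }
    kv.second = it->second;  // overwrite the seeded default of 0
  }
  return true;
}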

+6 -0  ge/hybrid/executor/hybrid_model_executor.cc

@@ -27,6 +27,12 @@ const char *const kEnvProfilingLevel = "HYBRID_PROFILING_LEVEL";
HybridModelExecutor::HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream)
    : model_(model), device_id_(device_id), stream_(stream) {}


HybridModelExecutor::~HybridModelExecutor() {
if (context_.rt_gen_context != nullptr) {
(void)rtCtxDestroy(context_.rt_gen_context);
}
}
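// The explicit destructor above exists to release rt_gen_context. The same
// guarantee can be expressed as a small RAII guard; sketch only, assuming
// nothing beyond the rtCtxDestroy call already used here and the runtime's
// rtContext_t handle type.
class RtContextGuard {
 public:
  explicit RtContextGuard(rtContext_t ctx) : ctx_(ctx) {}
  ~RtContextGuard() {
    if (ctx_ != nullptr) {
      (void)rtCtxDestroy(ctx_);
    }
  }
  RtContextGuard(const RtContextGuard &) = delete;
  RtContextGuard &operator=(const RtContextGuard &) = delete;

 private:
  rtContext_t ctx_ = nullptr;
};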

Status HybridModelExecutor::Init() {
  GELOGD("Start to init HybridGraphEngine.");
  GE_CHK_STATUS_RET_NOLOG(InitExecutionContext());


+1 -1  ge/hybrid/executor/hybrid_model_executor.h

@@ -35,7 +35,7 @@ class HybridModelExecutor {


  HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream);


  ~HybridModelExecutor();


  Status Init();




+2 -1  ge/hybrid/model/hybrid_model_builder.cc

@@ -618,7 +618,8 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_
  }

  int64_t var_size = CalcVarSizeInBytes(*tensor_desc);
  // var_size is only used for checking; no memory is allocated based on it
  tensor.reset(new (std::nothrow) TensorValue(dev_mem, static_cast<size_t>(var_size)));
  GE_CHECK_NOTNULL(tensor);
  return SUCCESS;
}


+2 -2  ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc

@@ -197,7 +197,7 @@ void AicpuExtInfoHandler::GetShapeAndType(const AicpuShapeAndType *shape_and_typ
    dims.emplace_back(tmpDim);
  }
  data_type = static_cast<DataType>(shape_and_type->type);
  shape = GeShape(dims);
}
}  // namespace hybrid
}  // namespace ge

+1 -0  ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc

@@ -48,6 +48,7 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
  std::vector<ConstGeTensorPtr> inputs;
  for (int32_t i = 0; i < context.NumInputs(); ++i) {
    const auto &input_desc = op_desc->GetInputDesc(i);
    GE_CHECK_NOTNULL(context.GetInput(i));
    auto in_tensor = MakeShared<GeTensor>(input_desc, reinterpret_cast<const uint8_t *>(context.GetInput(i)->GetData()),
                                          context.GetInput(i)->GetSize());
    GE_CHECK_NOTNULL(in_tensor);


+13 -7  ge/init/gelib.cc

@@ -167,7 +167,6 @@ Status GELib::SystemInitialize(const map<string, string> &options) {

  // In train and infer, profiling is always needed.
  InitOptions(options);
  auto model_manager = ModelManager::GetInstance();
  GE_CHECK_NOTNULL(model_manager);
  GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS,
                  return FAILED);
  // 1.`is_train_mode_` means case: train
  // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer
  // these two cases use the logical device id
  if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
    InitProfiling(this->options_, true);
    status = InitSystemWithOptions(this->options_);
  } else {
    InitProfiling(this->options_);
    status = InitSystemWithoutOptions();
  }
  return status;
}


void GELib::InitProfiling(Options &options, bool convert_2_phy_device_id) {
  GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id);
  std::lock_guard<std::mutex> lock(status_mutex_);
  GetContext().Init();
  // Profiling init
  if (ProfilingManager::Instance().Init(options, convert_2_phy_device_id) != SUCCESS) {
    GELOGW("Profiling init failed.");
  }
}
@@ -362,6 +361,9 @@ Status GELib::Finalize() {
GELOGW("not initialize"); GELOGW("not initialize");
return SUCCESS; return SUCCESS;
} }
if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
GE_CHK_RT_RET(rtSetDevice(options_.device_id));
}
  Status final_state = SUCCESS;
  Status mid_state;
  GELOGI("engineManager finalization.");
@@ -412,10 +414,14 @@ Status GELib::Finalize() {


  GetMutableGlobalOptions().erase(ENABLE_SINGLE_STREAM);


if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
GE_CHK_RT_RET(rtDeviceReset(options_.device_id));
}

  instancePtr_ = nullptr;
  init_flag_ = false;
  if (final_state != SUCCESS) {
    GELOGE(FAILED, "finalization failed.");
    return final_state;
  }
  GELOGI("finalization success.");


+1 -1  ge/init/gelib.h

@@ -68,7 +68,7 @@ class GELib {
  // get incre build cache path
  const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; }

  void InitProfiling(Options &options, bool convert_2_phy_device_id = false);
  void ShutDownProfiling();

  Status InitSystemWithoutOptions();


+7 -7  ge/session/inner_session.cc

@@ -18,6 +18,7 @@
#include <map>
#include <memory>
#include <vector>
#include "common/dump/dump_properties.h"
#include "common/util.h"
#include "framework/common/debug/ge_log.h"
#include "graph/ge_context.h"
@@ -30,6 +31,8 @@


namespace ge {
namespace {
const int32_t kDumpStatus = 0;

Status CheckReuseMemoryOption(const std::map<string, string> &options) {
  auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY);
  if (iter != options.end()) {
@@ -47,7 +50,7 @@ Status CheckReuseMemoryOption(const std::map<string, string> &options) {
}  // namespace

static std::mutex mutex_;  // BuildGraph and RunGraph use
bool InnerSession::is_dump_server_inited_ = false;

InnerSession::InnerSession(uint64_t session_id, const std::map<string, string> &options)
    : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {}


@@ -71,12 +74,12 @@ Status InnerSession::Initialize() {


  GE_CHK_RT_RET(rtSetDevice(GetContext().DeviceId()));

  DumpProperties dump_properties;
  dump_properties.InitByOptions();

  ret = graph_manager_.Initialize(options_);
  if (ret != SUCCESS) {
    GELOGE(ret, "[InnerSession:%lu] initialize failed.", session_id_);
    return ret;
  }


@@ -84,7 +87,6 @@ Status InnerSession::Initialize() {
  if (ret != SUCCESS) {
    GELOGE(ret, "failed to set malloc size");
    (void)graph_manager_.Finalize();
    GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
    return ret;
  }
@@ -95,7 +97,6 @@ Status InnerSession::Initialize() {
  ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID);
  if (ret != SUCCESS) {
    GELOGE(ret, "failed to init session instance");
  }
  init_flag_ = true;
  return SUCCESS;
@@ -120,8 +121,6 @@ Status InnerSession::Finalize() {
GELOGI("VarManager free var memory."); GELOGI("VarManager free var memory.");
(void)VarManager::Instance(session_id_)->FreeVarMemory(); (void)VarManager::Instance(session_id_)->FreeVarMemory();


PropertiesManager::Instance().RemoveDumpProperties(session_id_);

GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId()))); GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));


return ret; return ret;
@@ -297,4 +296,5 @@ Status InnerSession::SaveVariables(const Graph &graph, const std::vector<std::st
                                  const std::vector<Tensor> &outputs, std::vector<Tensor> &var_values) {
  return graph_manager_.SaveVariables(graph, var_names, outputs, var_values);
}

}  // namespace ge

+1 -0  ge/session/inner_session.h

@@ -71,6 +71,7 @@ class InnerSession {
  std::mutex resource_mutex_;  // AddGraph, RemoveGraph and Finalize use
  void UpdateThreadContext(const std::map<std::string, std::string> &options);
  void UpdateThreadContext(uint32_t graph_id);
  static bool is_dump_server_inited_;
};
}  // namespace ge




+38 -4  ge/single_op/single_op.cc

@@ -24,6 +24,7 @@
#include "graph/load/new_model_manager/model_utils.h" #include "graph/load/new_model_manager/model_utils.h"
#include "runtime/mem.h" #include "runtime/mem.h"
#include "single_op/single_op_manager.h" #include "single_op/single_op_manager.h"
#include "graph/load/new_model_manager/model_manager.h"


namespace ge {
namespace {
@@ -42,6 +43,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
    delete task;
    task = nullptr;
  }
  GELOGI("SingleOp destroy sessionId = %lu", aicpu_session_id_);
  ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_);
}


Status SingleOp::ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs) {
@@ -166,6 +169,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
    if (ret != SUCCESS) {
      return ret;
    }
    ret = task->OpenDump(args_, stream_);
    if (ret != SUCCESS) {
      GELOGE(ret, "Open dump failed");
      return ret;
    }
  }

  return ret;
@@ -173,9 +181,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c


void SingleOp::SetStream(rtStream_t stream) { stream_ = stream; }


void SingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; }

DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex, rtStream_t stream)
    : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {}


DynamicSingleOp::~DynamicSingleOp() {
  GELOGI("DynamicSingleOp destroy sessionId = %lu", aicpu_session_id_);
  ModelManager::GetInstance()->DestroyAicpuSession(aicpu_session_id_);
}

Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs,
                                       std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs) const {
  if (inputs.size() != input_desc.size()) {
@@ -236,14 +251,22 @@ Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace
  return SUCCESS;
}


Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc, vector<void *> &outputs) {
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));

std::vector<void *> workspace_buffers;
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));

return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
}

Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, const vector<DataBuffer> &input_buffers,
                                     vector<GeTensorDesc> &output_desc, vector<DataBuffer> &output_buffers) {
  GE_CHECK_NOTNULL(op_task_);
  GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
  std::lock_guard<std::mutex> lk(*stream_mutex_);

  std::vector<void *> inputs;
  std::vector<void *> outputs;
  for (auto &buffer : input_buffers) {
@@ -252,6 +275,17 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, con
  for (auto &buffer : output_buffers) {
    outputs.emplace_back(buffer.data);
  }

  if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
    return ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
  } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
    return op_task_->LaunchKernel(input_desc, inputs, output_desc, outputs, stream_);
  } else {
    GELOGE(UNSUPPORTED, "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
           op_task_->GetOpTaskType());
    return UNSUPPORTED;
  }
}

void DynamicSingleOp::SetSessionID(uint64_t session_id) { aicpu_session_id_ = session_id; }
}  // namespace ge

+10 -2  ge/single_op/single_op.h

@@ -27,6 +27,7 @@
#include "framework/executor/ge_executor.h" #include "framework/executor/ge_executor.h"
#include "runtime/stream.h" #include "runtime/stream.h"
#include "task/op_task.h" #include "task/op_task.h"
#include "cce/aicpu_engine_struct.h"


namespace ge {
class SingleOp {
@@ -36,6 +37,7 @@ class SingleOp {


  Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
  void SetStream(rtStream_t stream);
  void SetSessionID(uint64_t session_id);


 private:
  Status ValidateArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -50,6 +52,7 @@ class SingleOp {
  std::vector<void *> output_addr_list_;
  std::vector<size_t> output_sizes_;
  std::vector<uintptr_t> args_;
  uint64_t aicpu_session_id_ = 0;

  std::vector<OpTask *> tasks_;
  std::vector<std::vector<uintptr_t *>> arg_table_;
@@ -58,9 +61,10 @@ class SingleOp {
class DynamicSingleOp {
 public:
  DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
  ~DynamicSingleOp();
  Status ExecuteAsync(const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &inputs,
                      std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &outputs);
  void SetSessionID(uint64_t session_id);


 private:
  friend class SingleOpModel;
@@ -69,12 +73,16 @@ class DynamicSingleOp {


  Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, std::vector<void *> &workspaces);

  Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, const vector<void *> &inputs,
                        vector<GeTensorDesc> &output_desc, vector<void *> &outputs);

  std::unique_ptr<OpTask> op_task_;
  uintptr_t resource_id_ = 0;
  std::mutex *stream_mutex_;
  rtStream_t stream_ = nullptr;
  size_t num_inputs_ = 0;
  size_t num_outputs_ = 0;
  uint64_t aicpu_session_id_ = 0;
};
}  // namespace ge
#endif  // GE_SINGLE_OP_SINGLE_OP_H_

+60 -10  ge/single_op/single_op_model.cc

@@ -16,6 +16,7 @@


#include "single_op/single_op_model.h" #include "single_op/single_op_model.h"


#include <atomic>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
@@ -31,6 +32,8 @@
#include "task/aicpu_kernel_task_builder.h" #include "task/aicpu_kernel_task_builder.h"
#include "task/tbe_task_builder.h" #include "task/tbe_task_builder.h"


static std::atomic<std::uint64_t> aicpu_sessionid(0);

using domi::TaskDef; using domi::TaskDef;
using std::unique_ptr; using std::unique_ptr;
using std::vector; using std::vector;
@@ -250,17 +253,21 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
      }
      single_op.tasks_.emplace_back(task);
    } else {
      GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type());
      return UNSUPPORTED;
    }
  } else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
    GELOGD("Building AICPU_TF task");
    AiCpuTask *aicpu_task = nullptr;
    bool depend_compute_flag = false;
    uint64_t singleop_sessionid = aicpu_sessionid++;
    GELOGI("Build singleOp, sessionId = %lu", singleop_sessionid);
    auto ret = BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_sessionid);
    if (ret != SUCCESS) {
      return ret;
    }
    single_op.tasks_.emplace_back(aicpu_task);
    single_op.SetSessionID(singleop_sessionid);
  } else {
    // skip
    GELOGD("Skip task type: %d", static_cast<int>(task_type));
@@ -316,7 +323,8 @@ Status SingleOpModel::BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTa
  return SUCCESS;
}

Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag,
                                        bool &depend_compute_flag, uint64_t session_id) {
  auto iter = op_list_.find(kernel_def.op_index());
  if (iter == op_list_.end()) {
    GELOGE(INTERNAL_ERROR, "op desc not found. op index = %u", kernel_def.op_index());
@@ -329,11 +337,12 @@ Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, Sin
    return MEMALLOC_FAILED;
  }
  auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def);
  auto ret = builder.BuildTask(*aicpu_task, model_params_, dynamic_flag, session_id);
  if (ret != SUCCESS) {
    GELOGE(ret, "build aicpu_TF op task failed");
    return ret;
  }
  depend_compute_flag = (aicpu_task->GetUnknownType() == DEPEND_COMPUTE);

  *task = aicpu_task.release();
  return SUCCESS;
@@ -370,6 +379,27 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
  return BuildTaskList(single_op);
}


Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {
const domi::KernelDef &kernel_def = task_def.kernel();
const auto &context = kernel_def.context();
auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
if (kernel_type == cce::ccKernelType::TE) {
GELOGD("Building TBE task");
TbeOpTask *tbe_task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task));
single_op.op_task_.reset(tbe_task);
} else if (kernel_type == cce::ccKernelType::AI_CPU) {
GELOGD("Building AICPU_CC task");
OpTask *task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task));
single_op.op_task_.reset(task);
} else {
GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type());
return UNSUPPORTED;
}
return SUCCESS;
}

Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
  auto ge_model = model_helper_.GetGeModel();
  GE_CHECK_NOTNULL(ge_model);
@@ -385,10 +415,30 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
      GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks.");
      return UNSUPPORTED;
    }
    GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(task_def, single_op));
  } else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
    if (single_op.op_task_ != nullptr) {
      GELOGE(UNSUPPORTED, "Do not support dynamic op with multiple tasks.");
      return UNSUPPORTED;
    }
    GELOGD("Building AICPU_TF task");
    AiCpuTask *aicpu_task = nullptr;
    bool depend_compute_flag = false;
    uint64_t dynamic_singleop_sessionid = aicpu_sessionid++;
    GELOGI("Build dynamic singleOp, sessionId = %lu", dynamic_singleop_sessionid);
    GE_CHK_STATUS_RET_NOLOG(
      BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, true, depend_compute_flag, dynamic_singleop_sessionid));
    if (depend_compute_flag) {
      if (i >= tasks.size() - 1) {
        GELOGE(FAILED, "The copy task of the fourth operator was not found.");
        return FAILED;
      }
      ++i;
      const TaskDef &copy_task_def = tasks[i];
      GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex()));
    }
    single_op.op_task_.reset(aicpu_task);
    single_op.SetSessionID(dynamic_singleop_sessionid);
  } else {
    // skip
    GELOGD("Skip task type: %d", static_cast<int>(task_type));


+3 -1  ge/single_op/single_op_model.h

@@ -66,8 +66,10 @@ class SingleOpModel {
  Status BuildTaskList(SingleOp &single_op);
  Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
  Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
  Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag,
                           bool &depend_compute_flag, uint64_t session_id);
  Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task);
  Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);

  static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
  void ParseArgTable(TbeOpTask *task, SingleOp &op);


+23 -0  ge/single_op/task/aicpu_kernel_task_builder.cc

@@ -54,6 +54,29 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task) {
  task.SetSoName(so_name);
  task.SetkernelName(kernel_name);
  task.op_desc_ = op_desc_;

task.num_inputs_ = op_desc_->GetInputsSize();
task.num_outputs_ = op_desc_->GetOutputsSize();

// get kernel_ext_info
auto &kernel_ext_info = kernel_def_.kernel_ext_info();
auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size();
GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED,
"task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(),
kernel_ext_info_size);

ret = task.SetExtInfoAndType(kernel_ext_info);
if (ret != SUCCESS) {
GELOGE(ret, "Init ext info failed.");
return ret;
}

auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(task.args_.get());
if (task.ext_info_addr_dev_ != nullptr) {
aicpu_param_head->extInfoLength = kernel_ext_info.size();
aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
}

  return SUCCESS;
}
}  // namespace ge

+50 -20  ge/single_op/task/aicpu_task_builder.cc

@@ -30,13 +30,13 @@ Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<vo
  size_t arg_size = kernel_def_.args_size();
  auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
    return RT_FAILED;
  }

  const void *src_addr = reinterpret_cast<const void *>(addresses.data());
  uint64_t src_len = sizeof(void *) * addresses.size();
  rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    (void)rtFree(*io_addr);
    GELOGE(RT_FAILED, "rtMemcpy addresses failed, ret = %d", rt_ret);
@@ -69,8 +69,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke
    return RT_FAILED;
  }


  rt_ret = rtMemcpy(fwk_op_args, sizeof(STR_FWK_OP_KERNEL), &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
                    RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    (void)rtFree(fwk_op_args);
    GELOGE(RT_FAILED, "copy args failed, ret = %d", rt_ret);
@@ -80,7 +80,8 @@ Status AiCpuTaskBuilder::SetKernelArgs(void **args, STR_FWK_OP_KERNEL &fwk_op_ke
  return SUCCESS;
}


Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam &param,
                                            bool dynamic_flag) {
  if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
    GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL),
           kernel_def_.args_size());
@@ -88,31 +89,60 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam
  }
  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
  auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);
  if (dynamic_flag) {
    GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
  } else {
    if (ws_addr_vec.empty()) {
      GELOGE(PARAM_INVALID, "workspace Data Address is empty.");
      return PARAM_INVALID;
    }
    *kernel_workspace = ws_addr_vec[0];
  }
  GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), kernel_def_.task_info().data(),
                         kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));

  auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
  if (ret != SUCCESS) {
    return ret;
  }
  return SUCCESS;
}

Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag,
uint64_t session_id) {
void *io_addr = nullptr;
void *kernel_workspace = nullptr;
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&io_addr, &kernel_workspace, param, dynamic_flag));


  STR_FWK_OP_KERNEL fwk_op_kernel = {0};
  auto ret = SetFmkOpKernel(io_addr, kernel_workspace, fwk_op_kernel);
  if (ret != SUCCESS) {
    (void)rtFree(io_addr);
    return ret;
  }

  task.op_desc_ = op_desc_;
  task.num_inputs_ = op_desc_->GetInputsSize();
  task.num_outputs_ = op_desc_->GetOutputsSize();

  // get kernel_ext_info
  auto &kernel_ext_info = kernel_def_.kernel_ext_info();
  auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size();
  GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED,
                         "task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(),
                         kernel_ext_info_size);
  GE_CHK_STATUS_RET(task.SetExtInfoAndType(kernel_ext_info), "Init ext info failed.");

  if (task.ext_info_addr_dev_ != nullptr) {
    fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
    fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = kernel_ext_info_size;
  }
  GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "AiCpuTask init for summary and copy task failed.");

  // Create session
  fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = session_id;
  GELOGI("Begin to CreateAicpuSession, session id: %lu", session_id);
  GE_CHECK_NOTNULL(ModelManager::GetInstance());
  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
                  GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id);
@@ -127,8 +157,8 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam
  task.op_type_ = op_desc_->GetName();
  task.io_addr_ = io_addr;
  task.task_info_ = kernel_def_.task_info();
  task.workspace_addr_ = kernel_workspace;
  task.dynamic_flag_ = dynamic_flag;

  auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_);
  GELOGI("[TASK_INFO] %s %s", task.task_info_.c_str(), debug_info.c_str());


+3 -1  ge/single_op/task/aicpu_task_builder.h

@@ -29,12 +29,14 @@ class AiCpuTaskBuilder {
  AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def);
  ~AiCpuTaskBuilder() = default;

  Status BuildTask(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag, uint64_t session_id);

 private:
  static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
  Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
  Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
  Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, const SingleOpModelParam &param,
                            bool dynamic_flag);

  const OpDescPtr op_desc_;
  const domi::KernelExDef &kernel_def_;


+408 -21  ge/single_op/task/op_task.cc

@@ -20,8 +20,10 @@
#include <chrono>
#include <thread>

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/formats/formats.h"
#include "framework/common/debug/log.h"
#include "register/op_tiling.h"
#include "runtime/rt.h"
@@ -30,24 +32,31 @@ namespace ge {
namespace {
constexpr int kLaunchRetryTimes = 1000;
constexpr int kSleepTime = 10;
constexpr uint64_t kReleaseFlag = 1;
constexpr int kCopyNum = 2;
}  // namespace


Status OpTask::OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream) {
  if (DumpManager::GetInstance().GetDumpProperties().IsSingleOpNeedDump()) {
    GELOGI("Dump is open in single op, start to set dump info");
    std::vector<uint64_t> input_addrs;
    std::vector<uint64_t> output_adds;
    auto input_size = op_desc_->GetInputsSize();
    auto output_size = op_desc_->GetOutputsSize();
    auto all_size = io_addr.size();
    if (input_size + output_size != all_size) {
      GELOGE(FAILED, "io_addr size does not equal input size plus output size");
      return FAILED;
    }
    for (size_t i = 0; i < input_size; i++) {
      uint64_t input_addr = static_cast<uint64_t>(io_addr[i]);
      input_addrs.emplace_back(input_addr);
    }
    for (size_t j = 0; j < output_size; j++) {
      uint64_t output_addr = static_cast<uint64_t>(io_addr[input_size + j]);
      output_adds.emplace_back(output_addr);
    }
    dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
    auto status = dump_op_.LaunchDumpOp();
    if (status != SUCCESS) {
      GELOGE(status, "Launch dump op failed in single op");
@@ -112,11 +121,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  }
  GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());

  return SUCCESS;
}


@@ -218,6 +222,119 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *
  return SUCCESS;
}


AiCpuBaseTask::~AiCpuBaseTask() {
if (ext_info_addr_dev_ != nullptr) {
(void)rtFree(ext_info_addr_dev_);
}
}

Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info) {
if (kernel_ext_info.empty()) {
GELOGI("Kernel_ext_info is empty, no need copy to device.");
return SUCCESS;
}

int32_t unknown_shape_type_val = 0;
(void)AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);

aicpu_ext_handle_.reset(
new (std::nothrow)::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(), num_inputs_, num_outputs_, unknown_type_));
GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, FAILED, "Malloc aicpu_ext_handle mem failed!");

Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
if (ret != SUCCESS) {
GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
return ret;
}

GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, kernel_ext_info.size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, kernel_ext_info.size(), kernel_ext_info.data(), kernel_ext_info.size(),
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
std::vector<GeTensorDesc> &output_desc) {
GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
if (num_inputs_ == 0 && num_outputs_ == 0) {
GELOGI("No input and output, no need update ext info.");
return SUCCESS;
}

GE_CHECK_NOTNULL(aicpu_ext_handle_);
for (size_t i = 0; i < num_inputs_; ++i) {
GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(i, input_desc[i]),
"Input[%zu] update input shape failed.", i);
}

if (unknown_type_ != DEPEND_COMPUTE) {
for (size_t j = 0; j < num_outputs_; ++j) {
GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
"Output[%zu] UpdateOutputShapeAndType failed.", j);
GELOGD("Output[%zu] shape and type updated.", j);
}
}

GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_,
aicpu_ext_handle_->GetExtInfoLen(), // check size
aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(),
RT_MEMCPY_HOST_TO_DEVICE));

GELOGI("Update ext info end.");
return SUCCESS;
}

Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
if (num_outputs_ == 0) {
GELOGD("AiCpuBaseTask output_num is 0, no need update output shape.");
return SUCCESS;
}
GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape.");

GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));

for (size_t i = 0; i < num_outputs_; ++i) {
GeShape shape;
DataType data_type;
aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.",
i);
}
GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished.");
return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
auto shape_old = output_desc.GetShape();
output_desc.SetShape(shape_new);
GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());

auto origin_shape_old = output_desc.GetOriginShape();
auto origin_format = output_desc.GetOriginFormat();
auto format = output_desc.GetFormat();
if (origin_format == format) {
output_desc.SetOriginShape(shape_new);
return SUCCESS;
}

std::vector<int64_t> origin_dims_new;

auto trans_ret =
formats::TransShape(format, shape_new.GetDims(), output_desc.GetDataType(), origin_format, origin_dims_new);
GE_CHK_STATUS_RET(trans_ret, "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.",
origin_format, format, shape_new.ToString().c_str());

auto origin_shape_new = GeShape(origin_dims_new);
output_desc.SetOriginShape(origin_shape_new);
GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need update from %s ro %s.", origin_format, format,
origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
return SUCCESS;
}
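
A quick worked illustration of the branch above (the values are hypothetical, not taken from this change): when the storage format differs from the origin format, the freshly reported runtime shape is mapped back into the origin format through formats::TransShape before SetOriginShape is called.

// Illustration only, hypothetical values:
//   format        = FORMAT_NHWC, shape_new       = {8, 224, 224, 3}
//   origin_format = FORMAT_NCHW, origin_dims_new = {8, 3, 224, 224}
// When the two formats match, shape_new is used as the origin shape directly.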

AiCpuTask::~AiCpuTask() {
if (args_ != nullptr) {
(void)rtFree(args_);
@@ -226,6 +343,43 @@ AiCpuTask::~AiCpuTask() {
if (io_addr_ != nullptr) {
(void)rtFree(io_addr_);
}

if (dynamic_flag_ && workspace_addr_ != nullptr) {
(void)rtFree(workspace_addr_);
}
if (copy_workspace_buf_ != nullptr) {
(void)rtFree(copy_workspace_buf_);
}

if (copy_ioaddr_dev_ != nullptr) {
(void)rtFree(copy_ioaddr_dev_);
}

if (copy_input_release_flag_dev_ != nullptr) {
(void)rtFree(copy_input_release_flag_dev_);
}

if (copy_input_data_size_dev_ != nullptr) {
(void)rtFree(copy_input_data_size_dev_);
}

if (copy_input_src_dev_ != nullptr) {
(void)rtFree(copy_input_src_dev_);
}

if (copy_input_dst_dev_ != nullptr) {
(void)rtFree(copy_input_dst_dev_);
}

if (copy_task_args_buf_ != nullptr) {
(void)rtFree(copy_task_args_buf_);
}

for (auto summary : output_summary_) {
if (summary != nullptr) {
(void)rtFree(summary);
}
}
}


const void *AiCpuTask::GetIOAddr() const { return io_addr_; }
@@ -247,15 +401,225 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) {
}
GELOGI("[TASK_INFO] is %s", this->task_info_.c_str());


auto status = OpenDump(args_, op_desc_, stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in aicpu single op %s", op_type_.c_str());
return status;
}
GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str()); GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str());
return SUCCESS; return SUCCESS;
} }


Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm) {
std::vector<uint64_t> copy_input_release_flag;
std::vector<uint64_t> copy_input_data_size;
std::vector<uint64_t> copy_input_src;
std::vector<uint64_t> copy_input_dst;

for (size_t i = 0; i < num_outputs_; ++i) {
const auto &summary = output_summary_host_[i];
GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.", i,
summary.shape_data_ptr, summary.shape_data_size, summary.raw_data_ptr, summary.raw_data_size);
auto output = outputs[i];
copy_input_release_flag.emplace_back(kReleaseFlag);
copy_input_data_size.emplace_back(summary.raw_data_size);
copy_input_src.emplace_back(summary.raw_data_ptr);
copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output));

const auto &shape_buffer = out_shape_hbm[i];
copy_input_release_flag.emplace_back(kReleaseFlag);
copy_input_data_size.emplace_back(summary.shape_data_size);
copy_input_src.emplace_back(summary.shape_data_ptr);
copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
}

const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len, copy_input_release_flag.data(),
copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len, copy_input_data_size.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len, copy_input_src.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len, copy_input_dst.data(), copy_input_buf_len,
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm) {
for (size_t i = 0; i < num_outputs_; ++i) {
auto &result_summary = output_summary_host_[i];

GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary), output_summary_[i],
sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMCPY_DEVICE_TO_HOST));
auto shape_data_size = result_summary.shape_data_size;
void *shape_buffer = nullptr;
GE_MAKE_GUARD_RTMEM(shape_buffer);
GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
out_shape_hbm.emplace_back(shape_buffer);
}
return SUCCESS;
}

Status AiCpuTask::CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream) {
GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs, out_shape_hbm));

GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_KERNEL_DEFAULT, stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));
return SUCCESS;
}

Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm) {
for (size_t i = 0; i < num_outputs_; ++i) {
const auto &result_summary = output_summary_host_[i];
std::vector<int64_t> shape_dims;
const auto &shape_hbm = out_shape_hbm[i];

uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
GE_CHECK_NOTNULL(shape_addr);
GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm, result_summary.shape_data_size,
RT_MEMCPY_DEVICE_TO_HOST));

for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
shape_dims.emplace_back(shape_addr[dim_idx]);
GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
}

GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
"AiCpuTask update [%zu]th output shape failed.", i);
}
return SUCCESS;
}

Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
rtStream_t stream) {
if (num_outputs_ == 0) {
GELOGI("Output num is 0, there is no need to update the output and size.");
return SUCCESS;
}

GELOGI("Update shape and data by result summary begin.");

std::vector<void *> out_shape_hbm;
GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(out_shape_hbm),
"Read ResultSummary and update output shape failed.");

GE_CHK_STATUS_RET(CopyDataToHbm(outputs, out_shape_hbm, stream), "Copy data to output failed.");

GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc, out_shape_hbm), "Update shape by hbm buffer failed.");

GELOGI("Update shape and data by result summary end.");
return SUCCESS;
}

Status AiCpuTask::SetIO(const vector<void *> &inputs, vector<void *> &outputs) {
vector<uint64_t> io_addrs;
io_addrs.reserve(num_inputs_ + num_outputs_);
for (size_t i = 0; i < num_inputs_; ++i) {
GE_CHECK_NOTNULL(inputs[i]);
GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]);
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(inputs[i]));
}

if (unknown_type_ != DEPEND_COMPUTE) {
for (size_t i = 0; i < num_outputs_; ++i) {
GE_CHECK_NOTNULL(outputs[i]);
GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]);
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(outputs[i]));
}
} else {
for (size_t i = 0; i < num_outputs_; ++i) {
void *summary_addr = output_summary_[i];
io_addrs.emplace_back(reinterpret_cast<uintptr_t>(summary_addr));
}
}

if (!io_addrs.empty()) {
auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(io_addr_));
GE_CHECK_NOTNULL(dst_io_addr);
GE_CHK_RT_RET(rtMemcpy(dst_io_addr, sizeof(uint64_t) * io_addrs.size(), &io_addrs[0],
sizeof(uint64_t) * io_addrs.size(), RT_MEMCPY_HOST_TO_DEVICE));
}
return SUCCESS;
}

Status AiCpuTask::InitForSummaryAndCopy() {
if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
GELOGI("Unknown_type is %d, output num is %d.", unknown_type_, num_outputs_);
return SUCCESS;
}

output_summary_.resize(num_outputs_);
constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
for (size_t i = 0; i < num_outputs_; ++i) {
GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
}
output_summary_host_.resize(num_outputs_);

const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));

GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));

std::vector<uint64_t> copy_io_addr;
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));

const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();

GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));

GE_CHK_RT_RET(
rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size, copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL),
kernel_def.args_size());
return PARAM_INVALID;
}
GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(), kernel_def.task_info().data(),
kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));

STR_FWK_OP_KERNEL aicpu_task = {0};
auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL), kernel_def.args().data(), kernel_def.args().size());
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}

aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;

GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), &aicpu_task, sizeof(STR_FWK_OP_KERNEL),
RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS;
}

Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
rtStream_t stream) {
GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));
GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs));
GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));

if (unknown_type_ == DEPEND_SHAPE_RANGE) {
GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
} else if (unknown_type_ == DEPEND_COMPUTE) {
GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, outputs, stream));
}

return SUCCESS;
}
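
For orientation, a caller-side sketch of this new dynamic-shape entry point; the tensor shape, device buffers and stream below are hypothetical, and the GE_CHK_* macros are the same ones used throughout this file.

// Sketch only: drive an AiCpuTask whose output shape is unknown until runtime.
std::vector<GeTensorDesc> input_desc{GeTensorDesc(GeShape({1, 16}))};
std::vector<GeTensorDesc> output_desc(1);    // refreshed in place by the launch
std::vector<void *> inputs{input_buffer};    // hypothetical device buffer
std::vector<void *> outputs{output_buffer};  // hypothetical device buffer
rtStream_t stream = nullptr;
GE_CHK_RT_RET(rtStreamCreate(&stream, 0));
// Internally: UpdateExtInfo -> SetIO -> LaunchKernel(stream) -> synchronize,
// then either UpdateOutputShape (DEPEND_SHAPE_RANGE) or the result-summary
// copy path (DEPEND_COMPUTE) fills output_desc/outputs.
GE_CHK_STATUS_RET_NOLOG(task.LaunchKernel(input_desc, inputs, output_desc, outputs, stream));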

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
args_ = std::move(args);
arg_size_ = arg_size;
@@ -291,11 +655,34 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
}
GELOGD("Invoke rtCpuKernelLaunch succeeded");


auto status = OpenDump(args_.get(), op_desc_, stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in aicpucc single op");
return status;
}
return SUCCESS;
}

Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
rtStream_t stream) {
GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED,
"AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", unknown_type_);

GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));

size_t arg_index = 0;
auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_);
GE_CHECK_NOTNULL(task_io_addr);
for (auto &input : inputs) {
task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input);
}
for (auto &output : outputs) {
task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output);
}

GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
GE_CHK_RT_RET(rtStreamSynchronize(stream));

if (unknown_type_ == DEPEND_SHAPE_RANGE) {
GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
}

return SUCCESS;
}
} // namespace ge

+ 68
- 6
ge/single_op/task/op_task.h View File

@@ -27,6 +27,9 @@
#include "graph/op_kernel_bin.h" #include "graph/op_kernel_bin.h"
#include "runtime/stream.h" #include "runtime/stream.h"
#include "graph/node.h" #include "graph/node.h"
#include "cce/aicpu_engine_struct.h"
#include "hybrid/node_executor/aicpu/aicpu_ext_info.h"
#include "init/gelib.h"


namespace ge {
enum OpTaskType {
@@ -52,14 +55,20 @@ class OpTask {
virtual const void *GetIOAddr() const = 0;
const vector<int64_t> &GetWorkspaceSizes() const;
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
const OpDescPtr &GetOpdesc() const { return op_desc_; }
Status OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream);
virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) {
return UNSUPPORTED;
}


private:
std::vector<int64_t> workspace_sizes_;

protected:
Status OpenDump(const void *arg, const OpDescPtr &op_desc, rtStream_t stream);
DumpProperties dump_properties_;
DumpOp dump_op_;
OpDescPtr op_desc_;
};


class TbeOpTask : public OpTask {
@@ -97,10 +106,30 @@ class TbeOpTask : public OpTask {
uint32_t max_tiling_size_ = 0;
std::string tiling_data_;
NodePtr node_;
OpDescPtr op_desc_;
};


class AiCpuTask : public OpTask {
class AiCpuBaseTask : public OpTask {
public:
AiCpuBaseTask() = default;
~AiCpuBaseTask() override;
const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }

protected:
Status SetExtInfoAndType(const std::string &kernel_ext_info);

Status UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc, std::vector<GeTensorDesc> &output_desc);
Status UpdateOutputShape(vector<GeTensorDesc> &output_desc);
Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc);

protected:
size_t num_inputs_ = 0;
size_t num_outputs_ = 0;
UnknowShapeOpType unknown_type_ = DEPEND_IN_SHAPE;
std::unique_ptr<ge::hybrid::AicpuExtInfoHandler> aicpu_ext_handle_;
void *ext_info_addr_dev_ = nullptr;
};

class AiCpuTask : public AiCpuBaseTask {
public:
AiCpuTask() = default;
~AiCpuTask() override;
@@ -109,7 +138,24 @@ class AiCpuTask : public OpTask {
OpTaskType GetOpTaskType() override { return OP_TASK_AICPU; }
const void *GetIOAddr() const override;


Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;
Status SetMemCopyTask(const domi::KernelExDef &kernel_def);

private:
Status SetIO(const vector<void *> &inputs, vector<void *> &outputs);

// for copy task.
Status InitForSummaryAndCopy();
Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
rtStream_t stream);
Status ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm);

Status CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream);
Status PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm);

Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm);

friend class AiCpuTaskBuilder;
void *workspace_addr_ = nullptr;
std::string task_info_;
@@ -117,10 +163,24 @@ class AiCpuTask : public OpTask {
size_t arg_size_ = 0;
std::string op_type_;
void *io_addr_ = nullptr;
OpDescPtr op_desc_;

bool dynamic_flag_ = false;
// for copy task
void *copy_task_args_buf_ = nullptr;
void *copy_workspace_buf_ = nullptr;

std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;

void *copy_ioaddr_dev_ = nullptr;

void *copy_input_release_flag_dev_ = nullptr;
void *copy_input_data_size_dev_ = nullptr;
void *copy_input_src_dev_ = nullptr;
void *copy_input_dst_dev_ = nullptr;
};


class AiCpuCCTask : public OpTask {
class AiCpuCCTask : public AiCpuBaseTask {
public:
AiCpuCCTask() = default;
~AiCpuCCTask() override;
@@ -137,6 +197,9 @@ class AiCpuCCTask : public OpTask {
void SetIoAddr(void *io_addr);
size_t GetArgSize() const;


Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;

private:
friend class AiCpuCCTaskBuilder;
std::string so_name_;
@@ -146,7 +209,6 @@ class AiCpuCCTask : public OpTask {
uint32_t block_dim_ = 1;
void *sm_desc_ = nullptr;
void *io_addr_ = nullptr;
OpDescPtr op_desc_;
};
} // namespace ge




+ 69
- 0
inc/external/ge/ge_prof.h View File

@@ -0,0 +1,69 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_GE_GE_PROF_H_
#define INC_EXTERNAL_GE_GE_PROF_H_

#include <map>
#include <string>
#include <vector>

#include "ge/ge_api_error_codes.h"

namespace ge {
enum ProfDataTypeConfig {
kProfAcl = 0x0001,
kProfTaskTime = 0x0002,
kProfAiCoreMetrics = 0x0004,
kProfAicpuTrace = 0x0008,
kProfModelExecute = 0x0010,
kProfRuntimeApi = 0x0020,
kProfRuntimeTrace = 0x0040,
kProfScheduleTimeline = 0x0080,
kProfScheduleTrace = 0x0100,
kProfAiVectorCoreMetrics = 0x0200,
kProfSubtaskTime = 0x0400,
kProfTrainingTrace = 0x0800,
kProfHcclTrace = 0x1000,
kProfDataProcess = 0x2000,
kProfTaskTrace = 0x3842,
kProfModelLoad = 0x8000000000000000
};

enum ProfilingAicoreMetrics {
kAicoreArithmaticThroughput = 0,
kAicorePipeline = 1,
kAicoreSynchronization = 2,
kAicoreMemory = 3,
kAicoreInternalMemory = 4,
kAicoreStall = 5,
kAicoreMetricsAll = 255 // only for op_trace
};

typedef struct ProfAicoreEvents ProfAicoreEvents;
typedef struct aclgrphProfConfig aclgrphProfConfig;

Status aclgrphProfInit(const char *profiler_path, uint32_t length);
Status aclgrphProfFinalize();
aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
uint64_t data_type_config);
Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config);
Status aclgrphProfStart(aclgrphProfConfig *profiler_config);
Status aclgrphProfStop(aclgrphProfConfig *profiler_config);
} // namespace ge

#endif // INC_EXTERNAL_GE_GE_PROF_H_
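
A minimal end-to-end sketch of this new external API; the output path, device id and data-type mask below are hypothetical, and GE itself must already be initialized via ge::GEInitialize (aclgrphProfInit checks this internally).

#include <cstring>
#include "ge/ge_prof.h"

ge::Status RunProfiledSection() {
  const char path[] = "/tmp/profiling_out";  // hypothetical output directory
  if (ge::aclgrphProfInit(path, std::strlen(path)) != ge::SUCCESS) {
    return ge::FAILED;
  }
  // ProfDataTypeConfig values are bit flags and may be OR-ed together, as
  // ge_prof.cc itself does with kProfModelLoad | kProfTrainingTrace.
  uint32_t dev_ids[] = {0};  // hypothetical device list
  ge::aclgrphProfConfig *cfg = ge::aclgrphProfCreateConfig(
      dev_ids, 1, ge::kAicoreArithmaticThroughput, nullptr,
      ge::kProfTaskTime | ge::kProfModelExecute);
  if (cfg == nullptr) {
    return ge::FAILED;
  }
  (void)ge::aclgrphProfStart(cfg);
  // ... build and run graphs here ...
  (void)ge::aclgrphProfStop(cfg);
  (void)ge::aclgrphProfDestroyConfig(cfg);
  return ge::aclgrphProfFinalize();
}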

+ 8
- 3
inc/framework/common/ge_inner_error_codes.h View File

@@ -97,6 +97,7 @@ GE_ERRORNO_COMMON(INTERNAL_ERROR, 4, "Internal errors"); // 1343225
GE_ERRORNO_COMMON(CSEC_ERROR, 5, "Failed to call libc_sec API!");  // 1343225861
GE_ERRORNO_COMMON(TEE_ERROR, 6, "Failed to call tee API!");  // 1343225862
GE_ERRORNO_COMMON(END_OF_SEQUENCE, 7, "End of sequence!");  // 1343225863
GE_ERRORNO_COMMON(PATH_INVALID, 8, "Path is invalid!"); // 1343225864


// Error code for plugin manager
GE_ERRORNO_COMMON(GE_PLGMGR_PATH_INVALID, 30, "Path is invalid!");  // 1343225886
@@ -124,9 +125,13 @@ GE_ERRORNO_CLIENT(GE_CLI_GE_ALREADY_INITIALIZED, 10, "GE is already initialized.
GE_ERRORNO_CLIENT(GE_CLI_GE_NOT_INITIALIZED, 11, "GE is not yet initialized or is finalized.");  // 1343229963


// Init module error code definition
GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048
GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049
GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050
GE_ERRORNO_INIT(GE_MULTI_INIT, 0, "Multiple initializations are not supported."); // 1343234048
GE_ERRORNO_INIT(GE_FINALIZE_NOT_INIT, 1, "Finalize is not allowed before initialization."); // 1343234049
GE_ERRORNO_INIT(GE_MULTI_FINALIZE, 2, "Multiple finalizations are not supported."); // 1343234050
GE_ERRORNO_INIT(GE_PROF_MULTI_INIT, 3, "Multiple profiling initializations are not supported."); // 1343234051
GE_ERRORNO_INIT(GE_PROF_NOT_INIT, 4, "Profiling initialization has not been done.");  // 1343234052
GE_ERRORNO_INIT(GE_PROF_MODE_CONFLICT, 5,
"Profiling command mode which is preferred is running, the api mode will not work."); // 1343234053


// Session module error code definition
GE_ERRORNO_SESSION(GE_SESS_INIT_FAILED, 0, "Failed to initialize session.");  // 1343238144


+ 18
- 0
inc/framework/common/util.h View File

@@ -398,6 +398,24 @@ bool CheckOutputPathValid(const std::string &file_path, const std::string &atc_p
/// @param [out] result
///
bool ValidateStr(const std::string &filePath, const std::string &mode);

///
/// @ingroup domi_common
/// @brief Check whether the file is a normal file.
/// @param [in] file_path file path
/// @param [out] result
///
bool IsValidFile(const char *file_path);

///
/// @ingroup domi_common
/// @brief Check whether a path is valid
/// @param [in] path, path to be checked
/// @param [in] length, length of path
/// @return 0 success
/// @return -1 fail
///
Status CheckPath(const char *path, size_t length);
} // namespace ge


#endif // INC_FRAMEWORK_COMMON_UTIL_H_
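
A hedged usage sketch for the two helpers declared above (the model path and function name are hypothetical); PATH_INVALID is the error code added to ge_inner_error_codes.h in this same change.

#include <cstring>
#include "framework/common/util.h"

ge::Status LoadModelChecked(const char *model_path) {
  if (model_path == nullptr || ge::CheckPath(model_path, std::strlen(model_path)) != ge::SUCCESS) {
    return ge::PATH_INVALID;  // malformed or unsafe path
  }
  if (!ge::IsValidFile(model_path)) {
    return ge::PATH_INVALID;  // not a regular file
  }
  // ... proceed to read the model file ...
  return ge::SUCCESS;
}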

+ 7
- 7
metadef/graph/CMakeLists.txt View File

@@ -18,13 +18,13 @@
set(CMAKE_CXX_FLAGS "-Wno-unused-variable ${CMAKE_CXX_FLAGS}")
# add all proto files, generate corresponding .h and .cc files
file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"../proto/om.proto"
"../proto/ge_ir.proto"
"../proto/insert_op.proto"
"../proto/task.proto"
"../proto/fwk_adaper.proto"
"../proto/op_mapping_info.proto"
"../proto/dump_task.proto"
"../../proto/om.proto"
"../../proto/ge_ir.proto"
"../../proto/insert_op.proto"
"../../proto/task.proto"
"../../proto/fwk_adaper.proto"
"../../proto/op_mapping_info.proto"
"../../proto/dump_task.proto"
)


file(GLOB_RECURSE ONNX_PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}


+ 1
- 1
metadef/graph/compute_graph.cc View File

@@ -658,7 +658,7 @@ ComputeGraph::UpdateOutputMapping(const std::map<uint32_t, uint32_t> &output_map
return GRAPH_FAILED;
}


size_t num = op_desc->GetInputsSize();
size_t num = op_desc->GetAllInputsSize();
for (size_t i = 0; i < num; i++) {
GeTensorDesc tensor = op_desc->GetInputDesc(i);
uint32_t cur_index = 0;


+ 6
- 3
metadef/graph/format_refiner.cc View File

@@ -149,9 +149,10 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
// consider special node save process
// get all input desc format
bool node_is_all_nd = false;
auto input_size = static_cast<uint32_t>(op_desc->GetInputsSize());
auto input_size = static_cast<uint32_t>(op_desc->GetAllInputsSize());
for (uint32_t i = 0; i < input_size; i++) {
// Operator pre-set format but not origin format
GE_IF_BOOL_EXEC(op_desc->MutableInputDesc(i) == nullptr, continue);
auto input_format = op_desc->MutableInputDesc(i)->GetFormat();
// Pre-save data node (only main graph data) and default infer fail
if (node_ptr->GetType() == DATA) {
@@ -164,6 +165,7 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
// Get all output desc format
auto output_size = static_cast<uint32_t>(op_desc->GetOutputsSize());
for (uint32_t i = 0; i < output_size; i++) {
GE_IF_BOOL_EXEC(op_desc->MutableOutputDesc(i) == nullptr, continue);
auto output_format = op_desc->MutableOutputDesc(i)->GetFormat();
if (output_format != FORMAT_ND && output_format != FORMAT_RESERVED) {
node_is_all_nd = true;
@@ -222,8 +224,9 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge::
for (const auto &in_anchor : node->GetAllInDataAnchors()) {
GELOGD("Node is [%s] [B]", (node->GetName()).c_str());
auto in_data_anchor_idx = in_anchor->GetIdx();
auto to_be_set_format =
node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx))->GetOriginFormat();
auto input_desc = node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx));
GE_IF_BOOL_EXEC(input_desc == nullptr, continue);
auto to_be_set_format = input_desc->GetOriginFormat();
if (to_be_set_format == FORMAT_ND) {
GELOGD("Node [%s] [B], format is ND", (node->GetName()).c_str());
continue;


+ 1
- 0
metadef/graph/ge_attr_define.cc View File

@@ -123,6 +123,7 @@ const std::string ATTR_NAME_AIPP_OUTPUTS = "_aipp_outputs";


const std::string ATTR_NAME_INPUT_DIMS = "input_dims";


const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED = "_graph_has_been_added";
const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";
const std::string ATTR_NAME_PARENT_GRAPH_NAME = "_parent_graph_name";




+ 34
- 18
metadef/graph/node.cc View File

@@ -68,7 +68,7 @@ graphStatus Node::Init() {
return GRAPH_SUCCESS;
}
GE_CHK_BOOL_EXEC(op_ != nullptr, return GRAPH_FAILED, "original OpDesc is nullptr");
size_t size = op_->GetInputsSize();
size_t size = op_->GetAllInputsSize();
for (size_t i = 0; i < size; i++) {
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), i);
if (anchor == nullptr) {
@@ -305,13 +305,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con
GELOGE(GRAPH_FAILED, "add input desc failed."); GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED; return GRAPH_FAILED;
} }
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
return GRAPH_FAILED;

if (index < GetAllInDataAnchors().size()) {
(void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]);
} else {
std::shared_ptr<InDataAnchor> anchor =
ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());


return GRAPH_SUCCESS;
}
@@ -347,20 +353,30 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::AddLinkFrom(con
}


GE_CHECK_NOTNULL(op_);
auto op_desc = input_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);

if (op_->AddInputDesc(name, op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED;
auto input_op_desc = input_node->GetOpDesc();
GE_CHECK_NOTNULL(input_op_desc);
auto index = op_->GetInputIndexByName(name);
if (index != -1) {
if (index >= static_cast<int>(in_data_anchors_.size())) {
GELOGE(GRAPH_FAILED, "op %s get input name %s 's index %d is illegal.", op_->GetName().c_str(), name.c_str(),
index);
return GRAPH_FAILED;
}
(void)out_anchors.at(0)->LinkTo(in_data_anchors_[index]);
} else {
std::shared_ptr<InDataAnchor> anchor =
ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "in_data_anchors_size is:%zu, malloc shared_ptr failed.", in_data_anchors_.size());
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());
}
std::shared_ptr<InDataAnchor> anchor = ComGraphMakeShared<InDataAnchor>(shared_from_this(), in_data_anchors_.size());
if (anchor == nullptr) {
GELOGE(GRAPH_FAILED, "out_anchor size is:%zu, malloc shared_ptr failed.", out_anchors.size());
if (op_->AddInputDesc(name, input_op_desc->GetOutputDesc(0)) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "add input desc failed.");
return GRAPH_FAILED;
}
in_data_anchors_.push_back(anchor);
(void)out_anchors.at(0)->LinkTo(in_data_anchors_.back());


return GRAPH_SUCCESS;
}


+ 1
- 1
metadef/graph/op_desc.cc View File

@@ -675,7 +675,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI
return nullptr;
}
if (inputs_desc_[index]->IsValid() != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "inputsDesc[%u] is InValid", index);
GELOGW("inputsDesc[%u] is InValid", index);
return nullptr;
} else {
return inputs_desc_[static_cast<size_t>(index)];


+ 14
- 8
metadef/graph/operator.cc View File

@@ -1504,7 +1504,9 @@ class GraphBuilderImpl {
GE_CHK_BOOL_EXEC(dst_anchor != nullptr, return GRAPH_FAILED, "GetInDataAnchor failed.");


auto ret = GraphUtils::AddEdge(src_anchor, dst_anchor);
GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED, "AddEdge failed.");
GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return GRAPH_FAILED,
"from node[%s][%d] to node[%s][%d]AddEdge failed.", src_node_ptr->GetName().c_str(),
src_anchor->GetIdx(), dst_node_info->second->GetName().c_str(), dst_anchor->GetIdx());
}
}
auto out_control_anchor = src_node_ptr->GetOutControlAnchor();
@@ -1536,19 +1538,23 @@ inline bool HasSameNameNode(const ComputeGraphPtr &compute_graph) {
for (const auto &graph : compute_graph->GetAllSubgraphs()) {
std::set<string> node_names;
for (auto const &node : graph->GetDirectNode()) {
node_names.insert(node->GetName());
}
if (node_names.size() != graph->GetDirectNodesSize()) {
return true;
auto result = node_names.insert(node->GetName());
if (!result.second) {
GELOGE(GRAPH_FAILED, "graph %s has same name node%s", graph->GetName().c_str(), node->GetName().c_str());
return true;
}
}
}


std::set<string> node_names;
for (auto const &node : compute_graph->GetDirectNode()) {
node_names.insert(node->GetName());
auto result = node_names.insert(node->GetName());
if (!result.second) {
GELOGE(GRAPH_FAILED, "graph %s has same name node%s", compute_graph->GetName().c_str(), node->GetName().c_str());
return true;
}
}
return node_names.size() != compute_graph->GetDirectNodesSize();
return false;
}


ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector<ge::Operator> &inputs) {


+ 7
- 1
metadef/graph/shape_refiner.cc View File

@@ -51,6 +51,9 @@ graphStatus ReverseBrushWhileBodySubGraph(const ConstNodePtr &node) {
for (const auto &node_sub : sub_graph_body->GetAllNodes()) {
for (size_t i = 0; i < node_sub->GetAllInDataAnchorsSize(); i++) {
auto input_desc = node_sub->GetOpDesc()->MutableInputDesc(i);
GE_IF_BOOL_EXEC(input_desc == nullptr,
GELOGW("Get null input by index %zu from node %s ", i, node_sub->GetName().c_str());
continue);
(void)input_desc->SetUnknownDimNumShape();
}
for (size_t i = 0; i < node_sub->GetAllOutDataAnchorsSize(); i++) {
@@ -376,10 +379,13 @@ graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) {
continue;
}
int peer_out_idx = peer_out_data_anchor->GetIdx();
auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx));
auto peer_out_desc = peer_out_data_node->GetOpDesc()->MutableOutputDesc(static_cast<uint32_t>(peer_out_idx));


// check shape and dtype continuity. do not stop process
auto in_desc = node_ptr->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_idx));
if (in_desc == nullptr) {
continue;
}
auto in_shape = in_desc->GetShape().GetDims();
auto in_dtype = in_desc->GetDataType();
auto peer_out_shape = peer_out_desc->GetShape().GetDims();


+ 14
- 3
metadef/graph/utils/ge_ir_utils.cc View File

@@ -264,11 +264,11 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const
return;
}
// Input describes
auto size_in = op_desc->GetInputsSize();
auto size_in = op_desc->GetAllInputsSize();
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "input_desc_nums", &size_in);
if (size_in > 0) {
for (uint32_t i = 0; i < size_in; i++) {
auto input_desc = op_desc->GetInputDescPtr(i);
auto input_desc = op_desc->GetInputDescPtrDfault(i);
if (input_desc != nullptr) {
auto data_type = TypeUtils::DataTypeToSerialString(input_desc->GetDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "input_desc_dtype:" + std::to_string(i),
@@ -480,9 +480,20 @@ void OnnxUtils::AddAttrProtoFromNodeMembers(const NodePtr &node, onnx::NodeProto
if (!recv_list.empty()) {
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "recv_event_id_list", &recv_list);
}
// 2.Attributes added from node's op_(message OpDef)
auto op_desc = node->op_;
if (op_desc != nullptr) {
// for input_name_idx_ in opdesc
auto input_name_2_indexs = op_desc->GetAllInputName();
::google::protobuf::RepeatedPtrField<::std::string> input_names;
::google::protobuf::RepeatedField<::google::protobuf::int64> input_indexes;
for (const auto &input_name_2_index : input_name_2_indexs) {
std::string input_name = input_name_2_index.first;
input_names.Add(std::move(input_name));
input_indexes.Add(input_name_2_index.second);
}
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRINGS, "_input_name_key", input_names);
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "_input_name_value", input_indexes);
// 2.Attributes added from node's op_(message OpDef)
// Input and out describes
AddAttrProtoForOpInAndOutDesc(node_proto, op_desc);
// Others


+ 1
- 2
metadef/graph/utils/graph_utils.cc View File

@@ -1470,8 +1470,7 @@ graphStatus GraphUtils::CopyTensorAttrs(const OpDescPtr &dst_desc, const NodePtr
for (uint32_t i = 0; i < src_node->GetAllInDataAnchorsSize(); ++i) {
auto input_desc = dst_desc->MutableInputDesc(i);
if (input_desc == nullptr) {
GELOGE(GRAPH_FAILED, "Param dst node not valid");
return GRAPH_FAILED;
continue;
}
input_desc->CopyAttrsFrom(src_desc->GetInputDesc(i));
}


+ 0
- 1
metadef/graph/utils/op_desc_utils.cc View File

@@ -513,7 +513,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<GeTensorPtr> OpDescUtils::
}
return MutableWeights(*node);
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
OpDescUtils::SetWeights(ge::Node &node, const vector<ge::GeTensorPtr> &weights) {
GE_CHK_BOOL_EXEC(node.GetOpDesc() != nullptr, return GRAPH_PARAM_INVALID, "node.GetOpDesc is nullptr!");


+ 1
- 0
metadef/inc/graph/debug/ge_attr_define.h View File

@@ -142,6 +142,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM


GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_DIMS;


GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GRAPH_HAS_BEEN_ADDED;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_GRAPH_NAME;




+ 375
- 0
src/ge/client/ge_prof.cc View File

@@ -0,0 +1,375 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "ge/ge_prof.h"
#include "ge/ge_api.h"
#include "init/gelib.h"
#include "common/debug/log.h"
#include "framework/common/debug/ge_log.h"
#include "common/profiling/profiling_manager.h"
#include "graph/load/graph_loader.h"
#include "toolchain/prof_acl_api.h"

using std::map;
using std::string;
using std::vector;

namespace {
const uint32_t kMaxDeviceNum = 64;
const std::string PROFILING_INIT = "prof_init";
const std::string PROFILING_FINALIZE = "prof_finalize";
const std::string PROFILING_START = "prof_start";
const std::string PROFILING_STOP = "prof_stop";
const std::string DEVICES_NUMS = "devNums";
const std::string DEVICE_ID_LIST = "devIdList";
const std::string AICORE_METRICS = "aicoreMetrics";

const std::map<ge::ProfilingAicoreMetrics, std::string> kProfAicoreMetricsToString = {
{ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
{ge::kAicorePipeline, "AICORE_PIPELINE"},
{ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
{ge::kAicoreMemory, "AICORE_MEMORY"},
{ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
{ge::kAicoreStall, "AICORE_STALL"},
{ge::kAicoreMetricsAll, "AICORE_METRICS_ALL"}};

const std::map<uint64_t, uint64_t> kDataTypeConfigMapping = {{ge::kProfAcl, PROF_ACL_API},
{ge::kProfTaskTime, PROF_TASK_TIME},
{ge::kProfAiCoreMetrics, PROF_AICORE_METRICS},
{ge::kProfAicpuTrace, PROF_AICPU_TRACE},
{ge::kProfModelExecute, PROF_MODEL_EXECUTE},
{ge::kProfRuntimeApi, PROF_RUNTIME_API},
{ge::kProfRuntimeTrace, PROF_RUNTIME_TRACE},
{ge::kProfScheduleTimeline, PROF_SCHEDULE_TIMELINE},
{ge::kProfScheduleTrace, PROF_SCHEDULE_TRACE},
{ge::kProfAiVectorCoreMetrics, PROF_AIVECTORCORE_METRICS},
{ge::kProfSubtaskTime, PROF_SUBTASK_TIME},
{ge::kProfTrainingTrace, PROF_TRAINING_TRACE},
{ge::kProfHcclTrace, PROF_HCCL_TRACE},
{ge::kProfDataProcess, PROF_DATA_PROCESS},
{ge::kProfTaskTrace, PROF_TASK_TRACE},
{ge::kProfModelLoad, PROF_MODEL_LOAD}};
} // namespace

static bool g_graph_prof_init_ = false;
static std::mutex g_prof_mutex_;

namespace ge {
struct aclgrphProfConfig {
ProfConfig config;
};

Status aclgrphProfInit(const char *profiler_path, uint32_t length) {
GELOGT(TRACE_INIT, "Graph prof init start");

std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
if (g_graph_prof_init_) {
GELOGW("Multi graph profiling initializations.");
return GE_PROF_MULTI_INIT;
}

Status ret = CheckPath(profiler_path, length);
if (ret != SUCCESS) {
GELOGE(ret, "Profiling config path is invalid.");
return ret;
}
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof init failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}

ret = ProfInit(profiler_path);
if (ret != SUCCESS) {
GELOGE(ret, "ProfInit init fail");
return ret;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_INIT;
command.module_index = kProfModelLoad | kProfTrainingTrace;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command %s failed, config = %s", PROFILING_INIT.c_str(), profiler_path);
return ret;
}
if (!g_graph_prof_init_) {
g_graph_prof_init_ = true;
GELOGI("Profiling init successfully.");
}

GELOGI("Successfully execute GraphProfInit.");
return SUCCESS;
}

Status aclgrphProfFinalize() {
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}
std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}

if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph profiling is not initialized.");
return GE_PROF_NOT_INIT;
}
GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_FINALIZE;
Status ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command %s failed.", PROFILING_FINALIZE.c_str());
return ret;
}

ret = ProfFinalize();
if (ret != SUCCESS) {
GELOGE(ret, "Finalize profiling failed, result = %d", ret);
}

if (ret == SUCCESS) {
g_graph_prof_init_ = false;
GELOGI("Successfully execute GraphProfFinalize.");
}
return ret;
}

bool TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector<string> &prof_config_params) {
prof_config_params.clear();
prof_config_params.emplace_back(DEVICES_NUMS);
prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums));
prof_config_params.emplace_back(DEVICE_ID_LIST);
std::string devID = "";
if (profiler_config->config.devNums == 0) {
GELOGW("The device num is invalid.");
return false;
}
for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
devID.append(std::to_string(profiler_config->config.devIdList[i]));
if (i != profiler_config->config.devNums - 1) {
devID.append(",");
}
}

prof_config_params.push_back(devID);
prof_config_params.push_back(AICORE_METRICS);
auto iter =
kProfAicoreMetricsToString.find(static_cast<ProfilingAicoreMetrics>(profiler_config->config.aicoreMetrics));
if (iter == kProfAicoreMetricsToString.end()) {
GELOGW("The prof aicore metrics is invalid.");
return false;
}
prof_config_params.push_back(iter->second);
return true;
}

bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
if (deviceid_list == nullptr) {
GELOGE(PARAM_INVALID, "deviceIdList is nullptr");
return false;
}
if (device_nums == 0 || device_nums > kMaxDeviceNum) {
GELOGE(PARAM_INVALID, "The device nums is invalid.");
return false;
}

// real device num
int32_t dev_count = 0;
rtError_t rt_err = rtGetDeviceCount(&dev_count);
if (rt_err != RT_ERROR_NONE) {
GELOGE(INTERNAL_ERROR, "Get the Device count fail.");
return false;
}

if (device_nums > static_cast<uint32_t>(dev_count)) {
GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count);
return false;
}

std::unordered_set<uint32_t> record;
for (size_t i = 0; i < device_nums; ++i) {
uint32_t dev_id = deviceid_list[i];
if (dev_id >= static_cast<uint32_t>(dev_count)) {
GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count);
return false;
}
if (record.count(dev_id) > 0) {
GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id);
return false;
}
record.insert(dev_id);
}
return true;
}

aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
uint64_t data_type_config) {
if (!isProfConfigValid(deviceid_list, device_nums)) {
return nullptr;
}
aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig();
if (config == nullptr) {
GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail");
return nullptr;
}
config->config.devNums = device_nums;
if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list,
device_nums * sizeof(uint32_t)) != EOK) {
GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums);
delete config;
return nullptr;
}

config->config.aicoreMetrics = static_cast<ProfAicoreMetrics>(aicore_metrics);
uint64_t data_type = 0;
for (auto &iter : kDataTypeConfigMapping) {
if ((iter.first & data_type_config) == iter.first) {
data_type |= iter.second;
}
}
config->config.dataTypeConfig = data_type;
GELOGI("Successfully create prof config.");
return config;
}

Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr");
return PARAM_INVALID;
}

delete profiler_config;
GELOGI("Successfully destroy prof config.");
return SUCCESS;
}

Status aclgrphProfStart(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
return FAILED;
}
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}
if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph profiling is not initialized.");
return GE_PROF_NOT_INIT;
}

Status ret = ProfStartProfiling(&profiler_config->config);
if (ret != SUCCESS) {
GELOGE(ret, "Start profiling failed, prof result = %d", ret);
return FAILED;
}

std::vector<string> prof_params;
if (!TransProfConfigToParam(profiler_config, prof_params)) {
GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
return PARAM_INVALID;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_START;
command.cmd_params = prof_params;
command.module_index = profiler_config->config.dataTypeConfig;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command failed");
return FAILED;
}

GELOGI("Successfully execute GraphProfStartProfiling.");

return SUCCESS;
}

Status aclgrphProfStop(aclgrphProfConfig *profiler_config) {
if (profiler_config == nullptr) {
GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
return FAILED;
}
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
return FAILED;
}

std::lock_guard<std::mutex> lock(g_prof_mutex_);
// if command mode is set, just return
if (ProfilingManager::Instance().ProfilingOn()) {
GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
return GE_PROF_MODE_CONFLICT;
}
if (!g_graph_prof_init_) {
GELOGE(GE_PROF_NOT_INIT, "Graph profiling is not initialized.");
return GE_PROF_NOT_INIT;
}

Status ret = ProfStopProfiling(&profiler_config->config);
if (ret != SUCCESS) {
GELOGE(ret, "Stop profiling failed, prof result = %d", ret);
return ret;
}

std::vector<string> prof_params;
if (!TransProfConfigToParam(profiler_config, prof_params)) {
GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
return PARAM_INVALID;
}

GraphLoader graph_loader;
Command command;
command.cmd_params.clear();
command.cmd_type = PROFILING_STOP;
command.cmd_params = prof_params;
command.module_index = profiler_config->config.dataTypeConfig;
ret = graph_loader.CommandHandle(command);
if (ret != SUCCESS) {
GELOGE(ret, "Handle profiling command failed");
return FAILED;
}

GELOGI("Successfully execute GraphProfStopProfiling.");
return SUCCESS;
}
} // namespace ge
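
For reference, TransProfConfigToParam above flattens the config into the key/value parameter list handed to GraphLoader::CommandHandle. For a hypothetical two-device config the resulting vector would be:

// {"devNums", "2", "devIdList", "0,1", "aicoreMetrics", "AICORE_PIPELINE"}
// i.e. DEVICES_NUMS, DEVICE_ID_LIST and AICORE_METRICS interleaved with their
// rendered values, device ids comma-joined in list order.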

+ 10
- 8
third_party/fwkacllib/inc/ops/aipp.h View File

@@ -25,14 +25,16 @@


namespace ge {
/**
*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC), image normalization (by subtracting the mean value or multiplying a factor), image cropping (by specifying the crop start and cropping the image to the size required by the neural network), and much more.
*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC),
image normalization (by subtracting the mean value or multiplying a factor), image cropping
(by specifying the crop start and cropping the image to the size required by the neural network), and much more. \n


*@par Inputs:
*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
*@li params: Dynamic AIPP configuration parameters of type uint8.
*@li params: Dynamic AIPP configuration parameters of type uint8. \n


*@par Attributes: *@par Attributes:
*aipp_config_path: A required string, specifying the path of the AIPP configuration file
*aipp_config_path: A required string, specifying the path of the AIPP configuration file. \n


*@par Outputs: *@par Outputs:
*features: The AIPP-processed output tensor of type float16 or uint8. *features: The AIPP-processed output tensor of type float16 or uint8.
@@ -47,17 +49,17 @@ REG_OP(Aipp)
.OP_END_FACTORY_REG(Aipp) .OP_END_FACTORY_REG(Aipp)


/** /**
*@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic \n
in aipp config file, framework will auto add one input node to graph at last.
*@brief Performs this op is for dynamic aipp.If you set aipp-mode to dynamic
in aipp config file, framework will auto add one input node to graph at last. \n


*@par Inputs: *@par Inputs:
*data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
*data: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. \n


*@par Attributes: *@par Attributes:
*index: specify aipp serial num
*index: specify aipp serial num \n


*@par Outputs: *@par Outputs:
*out: The AIPP-processed output tensor of all types.
*out: The AIPP-processed output tensor of all types. \n


*@par Third-party framework compatibility *@par Third-party framework compatibility
*Compatible with the TensorFlow operator AippData. *Compatible with the TensorFlow operator AippData.


+ 227 - 210 third_party/fwkacllib/inc/ops/array_ops.h (file diff suppressed because it is too large)


+ 44 - 44 third_party/fwkacllib/inc/ops/audio_ops.h

@@ -26,29 +26,29 @@
namespace ge {

/**
*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of
taking the DCT-II of a log-magnitude mel-scale spectrogram. \n

*@par Inputs:
*Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar.
* @li spectrogram: A 3D float tensor.
* @li sample_rate: The MFCC sample rate. \n

*@par Attributes:
*@li upper_frequency_limit: The highest frequency for calculation.
*@li lower_frequency_limit: The lowest frequency for calculation.
*@li filterbank_channel_count: Resolution of the Mel bank.
*@li dct_coefficient_count: Number of output channels to produce
per time slice. \n

*@par Outputs:
*y: A Tensor of type float32. \n

*@attention Constraints:
*Mfcc runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Mfcc. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
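For orientation, the DCT-II the brief above refers to is the standard one (standard definition, not taken from this header): with K = filterbank_channel_count log-mel channels S_k, the n-th cepstral coefficient is

c_n = \sum_{k=0}^{K-1} \log(S_k)\,\cos\!\left[\frac{\pi n}{K}\left(k + \tfrac{1}{2}\right)\right], \qquad n = 0, \ldots, \text{dct\_coefficient\_count} - 1.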
@@ -64,26 +64,26 @@ REG_OP(Mfcc)
.OP_END_FACTORY_REG(Mfcc)

/**
*@brief Decodes and generates a spectrogram from a wav float tensor. \n

*@par Inputs:
*Input "x" is a 2D matrix.
* x: A float tensor. Float representation of audio data. \n

*@par Attributes:
*@li window_size: Size of the spectrogram window.
*@li stride: Size of the spectrogram stride.
*@li magnitude_squared: If true, uses squared magnitude. \n

*@par Outputs:
*spectrogram: A 3D float Tensor. \n

*@attention Constraints:
*AudioSpectrogram runs on the Ascend AI CPU, which delivers
poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AudioSpectrogram. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -98,26 +98,26 @@ REG_OP(AudioSpectrogram)
.OP_END_FACTORY_REG(AudioSpectrogram)

/**
*@brief Decodes a 16-bit WAV file into a float tensor. \n

*@par Inputs:
*contents: A Tensor of type string. The WAV-encoded audio, usually from a file. \n

*@par Attributes:
*@li desired_channels: An optional int. Defaults to "-1".
Number of sample channels wanted.
*@li desired_samples: An optional int. Defaults to "-1".
Length of audio requested. \n

*@par Outputs:
*@li audio: A Tensor of type float32.
*@li sample_rate: A Tensor of type int32. \n

*@attention Constraints:
*DecodeWav runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator DecodeWav. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
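The float output above follows the usual WAV convention of mapping signed 16-bit PCM samples into [-1.0, 1.0]. A plain C++ illustration of that mapping (the convention only, not this operator's kernel):

#include <cstdint>
#include <vector>

// Map signed 16-bit PCM samples to floats in [-1.0, 1.0], the convention
// the DecodeWav description above relies on. Illustration only.
std::vector<float> PcmToFloat(const std::vector<int16_t> &pcm) {
  std::vector<float> out;
  out.reserve(pcm.size());
  for (int16_t s : pcm) {
    out.push_back(static_cast<float>(s) / 32768.0f);
  }
  return out;
}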
@@ -132,21 +132,21 @@ REG_OP(DecodeWav)
.OP_END_FACTORY_REG(DecodeWav)

/**
*@brief Encodes audio data using the WAV file format. \n

*@par Inputs:
*Including:
* @li audio: A Tensor of type DT_FLOAT.
* @li sample_rate: A Tensor of type DT_INT32. \n

*@par Outputs:
*contents: A Tensor of type DT_STRING. \n

*@attention Constraints:
*EncodeWav runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator EncodeWav. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.


+ 41 - 40 third_party/fwkacllib/inc/ops/batch_ops.h

@@ -26,35 +26,36 @@
namespace ge {

/**
*@brief Creates batches of tensors in "x_tensors". \n

*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors.
*x_tensors: The list or dictionary of tensors to enqueue. It's a dynamic input. \n

*@par Attributes:
*@li num_batch_threads: The number of threads enqueuing "x_tensors".
The batching will be nondeterministic if "num_batch_threads" > 1.
*@li max_batch_size: The maximum batch size pulled from the queue.
*@li max_enqueued_batches: The maximum number of batches pulled from the queue.
*@li batch_timeout_micros: The batch processing timeout, in microseconds.
*@li allowed_batch_sizes: The allowed batch size pulled from the queue.
*@li grad_timeout_micros: The gradient batch processing timeout,
in microseconds.
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions.
*@li batching_queue: The queue resource container. \n

*@par Outputs:
*@li y_index: A Tensor. The index of a BatchTensor. Must be in row-major order.
*@li y_id: A Tensor. The ID of a BatchTensor. Must be in row-major order.
*@li y_tensors: A list or dictionary of tensors with
the same types as "x_tensors". It's a dynamic output. \n

*@attention Constraints:
*Batch runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Batch.
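A hedged sketch of the flushing contract implied by max_batch_size and batch_timeout_micros, in plain single-threaded C++ (an illustration of the contract, not the operator's queue implementation):

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Illustration only: flush a pending batch when it reaches max_batch_size or
// when batch_timeout_micros has elapsed since its first element arrived.
// Names mirror the attributes documented above.
template <typename Tensor>
class BatchQueue {
 public:
  BatchQueue(size_t max_batch_size, int64_t batch_timeout_micros)
      : max_batch_size_(max_batch_size), timeout_(batch_timeout_micros) {}

  // Returns a full (or timed-out) batch, or an empty vector if still waiting.
  std::vector<Tensor> Enqueue(Tensor t) {
    if (pending_.empty()) {
      first_arrival_ = std::chrono::steady_clock::now();
    }
    pending_.push_back(std::move(t));
    auto waited = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - first_arrival_)
                      .count();
    if (pending_.size() >= max_batch_size_ || waited >= timeout_) {
      std::vector<Tensor> batch;
      batch.swap(pending_);
      return batch;
    }
    return {};
  }

 private:
  size_t max_batch_size_;
  int64_t timeout_;
  std::vector<Tensor> pending_;
  std::chrono::steady_clock::time_point first_arrival_;
};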
@@ -79,26 +80,26 @@ REG_OP(Batch)
.OP_END_FACTORY_REG(Batch)

/**
*@brief Reverses the operation of Batch for a single output Tensor. \n

*@par Inputs:
*Input "x_tensors" is a list or a dictionary of tensors.
* @li x_tensors: The list or dictionary of tensors to enqueue.
* @li index: The matching "batch_index" obtained from Batch.
* @li id: The "id" scalar emitted by Batch. \n

*@par Attributes:
*@li timeout_micros: The unbatch processing timeout, in microseconds.
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions. \n

*@par Outputs:
*y_tensor: A list or dictionary of tensors with the same types as "x_tensors". \n

*@attention Constraints:
*Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Unbatch.
@@ -117,27 +118,27 @@ REG_OP(Unbatch)
.OP_END_FACTORY_REG(Unbatch)

/**
*@brief Acts like Batch but using the given "batch_index" index of batching
things as they become available. \n

*@par Inputs:
*Input "x_input" is a list or a dictionary of tensors.
* @li x_input: The input to the Unbatch operation.
* @li index: The batch_index given to the Unbatch operation.
* @li id: The "id" scalar emitted by Batch.
* @li grad: The downstream gradient. \n

*@par Attributes:
*@li container: If non-empty, this queue is placed in the given container.
Otherwise, a default container is used.
*@li shared_name: If set, this queue will be shared under the given name
across multiple sessions. \n

*@par Outputs:
*y_grad: The return value, either an empty tensor or the batched gradient. \n

*@attention Constraints:
*UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator UnbatchGrad.


+ 10 - 10 third_party/fwkacllib/inc/ops/bitwise_ops.h

@@ -26,20 +26,20 @@
namespace ge {

/**
*@brief Element-wise computes the bitwise right-shift of x and y. \n

*@par Inputs:
* @li x: A Tensor. Must be one of the following types: int8, int16, int32,
int64, uint8, uint16, uint32, uint64.
* @li y: A Tensor. Has the same type as "x". \n

*@par Outputs:
* z: A Tensor. Has the same type as "x". \n

*@attention Constraints:
*RightShift runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator RightShift.
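The element-wise contract is the same as C++'s >> applied per element; a minimal plain-C++ illustration (semantics only, not the kernel):

#include <cstdint>
#include <vector>

// Element-wise z[i] = x[i] >> y[i], the semantics RightShift documents.
// Illustration only; shift counts are assumed to lie in [0, bit-width).
std::vector<int32_t> RightShiftElementwise(const std::vector<int32_t> &x,
                                           const std::vector<int32_t> &y) {
  std::vector<int32_t> z(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    z[i] = x[i] >> y[i];
  }
  return z;
}
// Example: x = {16, -16}, y = {2, 2} -> z = {4, -4} (arithmetic shift for
// signed types on mainstream implementations).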


+ 14 - 14 third_party/fwkacllib/inc/ops/boosted_trees_ops.h

@@ -26,28 +26,28 @@
namespace ge {

/**
*@brief Bucketizes each feature based on bucket boundaries. \n

*@par Inputs:
*Input "float_values" is a list of 1D tensors. Input "bucket_boundaries" is
a list of 1D tensors.
* @li float_values: A list of rank 1 tensors each containing float
values for a single feature. It's a dynamic input.
* @li bucket_boundaries: A list of rank 1 tensors each containing
the bucket boundaries for a single feature. It's a dynamic input. \n

*@par Attributes:
*@li num_features: Number of features. \n

*@par Outputs:
*@li y: A list of rank 1 tensors each containing the bucketized values for
a single feature. \n

*@attention Constraints:
*BoostedTreesBucketize runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator BoostedTreesBucketize. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
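Bucketization here follows the usual "index of the first boundary greater than the value" rule; a short C++ illustration of those semantics (assumed convention, not this operator's kernel):

#include <algorithm>
#include <cstdint>
#include <vector>

// y[i] = number of sorted boundaries <= values[i], i.e. the bucket index.
// Matches the usual bucketize convention; illustration only.
std::vector<int32_t> Bucketize(const std::vector<float> &values,
                               const std::vector<float> &boundaries) {
  std::vector<int32_t> y;
  y.reserve(values.size());
  for (float v : values) {
    auto it = std::upper_bound(boundaries.begin(), boundaries.end(), v);
    y.push_back(static_cast<int32_t>(it - boundaries.begin()));
  }
  return y;
}
// Example: boundaries = {0, 10, 100}; values = {-5, 0, 50} -> y = {0, 1, 2}.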


+ 154 - 154 third_party/fwkacllib/inc/ops/candidate_sampling_ops.h

@@ -26,44 +26,44 @@
namespace ge {

/**
*@brief Generates labels for candidate sampling with
a learned unigram distribution. \n

*@par Inputs:
*Input "true_classes" is a 2D matrix.
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected to occur
in a batch of sampled candidates.
* If "unique" is true, then this is a probability. \n

*@attention Constraints:
*ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -82,44 +82,44 @@ REG_OP(ThreadUnsafeUnigramCandidateSampler)
.OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a uniform
distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
*Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled",
in which each element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur
in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected to occur
in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints:
*UniformCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator UniformCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
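For the uniform case the contract is easy to state in plain C++; a hedged sketch of drawing num_sampled unique candidates from [0, range_max) with rejection (semantics only, not the kernel):

#include <cstdint>
#include <random>
#include <unordered_set>
#include <vector>

// Draw num_sampled unique IDs uniformly from [0, range_max) by rejection,
// mirroring the "unique = true" contract above. Under a uniform distribution
// each ID's expected count is then num_sampled / range_max.
std::vector<int64_t> UniformSampleUnique(int64_t range_max, size_t num_sampled,
                                         uint64_t seed) {
  std::mt19937_64 rng(seed);
  std::uniform_int_distribution<int64_t> dist(0, range_max - 1);
  std::unordered_set<int64_t> seen;
  std::vector<int64_t> sampled;
  while (sampled.size() < num_sampled) {
    int64_t candidate = dist(rng);
    if (seen.insert(candidate).second) {  // reject duplicates
      sampled.push_back(candidate);
    }
  }
  return sampled;
}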
@@ -138,56 +138,56 @@ REG_OP(UniformCandidateSampler)
.OP_END_FACTORY_REG(UniformCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique. This requires
some approximation to estimate the post-rejection sampling probabilities.
*@li range_max: The sampler will sample integers from the interval [0, range_max).
*@li vocab_file: Each valid line in this file (which should have a
CSV-like format) corresponds to a valid word ID.
*IDs are in sequential order, starting from num_reserved_ids.
*@li distortion: The distortion is used to skew the unigram probability
distribution. Each weight is first raised to the distortion's power before
being added to the internal unigram distribution.
*@li num_reserved_ids: Optionally, some reserved IDs can be added in the range
[0, ..., num_reserved_ids) by the users.
* One use case is that a special unknown word token is used as ID 0.
*@li num_shards: A sampler can be used to sample from a subset of the
original range, in order to speed up the whole computation through parallelism.
*@li shard: A sampler can be used to sample from a subset of the original
range in order to speed up the whole computation through parallelism.
*@li unigrams: A list of unigram counts or probabilities, one per ID in
sequential order.
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected
to occur in a batch of sampled candidates.
If "unique" is true, then this is a probability. \n

*@attention Constraints:
* FixedUnigramCandidateSampler runs on the Ascend AI CPU,
which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator FixedUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -212,43 +212,43 @@ REG_OP(FixedUnigramCandidateSampler)
.OP_END_FACTORY_REG(FixedUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique.
*This requires some approximation to estimate the post-rejection
sampling probabilities.
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected
to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints:
*LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers
poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -267,42 +267,42 @@ REG_OP(LearnedUnigramCandidateSampler)
.OP_END_FACTORY_REG(LearnedUnigramCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a log-uniform
distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection, so that all
sampled candidates in a batch are unique. This requires some approximation
to estimate the post-rejection sampling probabilities.
*@li range_max: The sampler will sample integers from the interval
[0, range_max).
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled", in which each
element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing
the number of times each candidate is expected to occur in a batch of sampled
candidates. If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected
to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability. \n

*@attention Constraints:
*LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers
poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator LogUniformCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -321,38 +321,38 @@ REG_OP(LogUniformCandidateSampler)
.OP_END_FACTORY_REG(LogUniformCandidateSampler)

/**
*@brief Generates labels for candidate sampling with a learned
unigram distribution. \n

*@par Inputs:
*true_classes: A "batch_size * num_true" matrix, in which each row contains
the IDs of the "num_true" "target_classes" in the corresponding original label.
* Input "true_classes" is a 2D matrix. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li num_sampled: Number of candidates to randomly sample.
*@li unique: If "unique" is true, samples with rejection,
so that all sampled candidates in a batch are unique. This requires some
approximation to estimate the post-rejection sampling probabilities.
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
*@li sampled_candidates: A vector of length "num_sampled",
in which each element is the ID of a sampled candidate.
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
number of times each candidate is expected to occur in a batch of sampled candidates.
*If "unique" is true, then this is a probability.
*@li sampled_expected_count: A vector of length "num_sampled", representing,
for each sampled candidate, the number of times it is expected
to occur in a batch of sampled candidates. If "unique" is true, then this is a probability. \n

*@attention Constraints:
*AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AllCandidateSampler. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
@@ -370,31 +370,31 @@ REG_OP(AllCandidateSampler)
.OP_END_FACTORY_REG(AllCandidateSampler)

/**
*@brief Computes the "ids" of the positions in "sampled_candidates" that
match "true_labels". \n

*@par Inputs:
* Input "true_classes" is a 2D matrix.
* @li true_classes: The "true_classes" output of UnpackSparseLabels.
* @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n

*@par Attributes:
*@li num_true: Number of true labels per context.
*@li seed: If either "seed" or "seed2" is set to be non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision. \n

*@par Outputs:
* @li indices: A vector of indices corresponding to rows of "true_candidates".
* @li ids: A vector of IDs of positions in "sampled_candidates" that match a
"true_label" for the row with the corresponding index in indices.
* @li weights: A vector of the same length as "indices" and "ids", in which
each element is -FLOAT_MAX. \n

*@attention Constraints:
*ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator ComputeAccidentalHits. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.


+ 4 - 4 third_party/fwkacllib/inc/ops/condtake_ops.h

@@ -26,17 +26,17 @@

namespace ge {
/**
*@brief Takes elements from data if the specified condition is satisfied on mask. \n

*@par Inputs:
*@li data: The input tensor from which to take elements. High-dimension input is
first flattened.
*@li mask: The condition parameter; must have the same shape as data. \n

*@par Attributes:
*@li mode: convert by convert in Mode.
*@li val: convert by <class 'float'>.
*@li eps: convert by <class 'float'> (default: 1e-06). \n

*@par Outputs:
*@li out_data: The elements taken.


+ 91 - 91 third_party/fwkacllib/inc/ops/control_flow_ops.h

@@ -27,21 +27,21 @@
namespace ge {

/**
*@brief Forwards the value of an available tensor from input "x" to output "y".
* Merge waits for at least one of the input tensors to become available.
* It is usually combined with Switch to implement branching.
* Merge forwards the first tensor to become available to output "y",
* and sets "value_index" to the index of that tensor in the inputs. \n

*@par Inputs:
*x: The input tensors, one of which will become available.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. It's a dynamic input. \n

*@par Outputs:
*@li y: The available tensor. Has the same type as "x".
*@li value_index: A scalar of type int32, for the index of the chosen input
* tensor. \n

*@see Switch()
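Since Switch and Merge are documented here as a pair, a small plain-C++ sketch of their port semantics may help (a dataflow emulation of the descriptions above, not GE executor code):

#include <cstdint>
#include <optional>
#include <stdexcept>
#include <utility>
#include <vector>

// Switch: route "data" to exactly one of two output ports based on "pred".
template <typename T>
std::pair<std::optional<T>, std::optional<T>> SwitchOp(T data, bool pred) {
  if (pred) {
    return {std::nullopt, std::move(data)};  // {output_false, output_true}
  }
  return {std::move(data), std::nullopt};
}

// Merge: forward the first available input and report its index.
template <typename T>
std::pair<T, int32_t> MergeOp(const std::vector<std::optional<T>> &inputs) {
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].has_value()) {
      return {*inputs[i], static_cast<int32_t>(i)};
    }
  }
  throw std::logic_error("Merge: no input available");
}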


@@ -59,21 +59,21 @@ REG_OP(Merge)
.OP_END_FACTORY_REG(Merge)

/**
*@brief Forwards the value of an available tensor from input "x" to output "y".
* Merge waits for at least one of the input tensors to become available.
* It is usually combined with Switch to implement branching.
* Merge forwards the first tensor to become available to output "y",
* and sets "value_index" to the index of that tensor in the inputs. \n

*@par Inputs:
*x: The input tensors, one of which will become available.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. It's a dynamic input. \n

*@par Outputs:
*@li y: The available tensor. Has the same type as "x".
*@li value_index: A scalar of type int32, for the index of the chosen input
* tensor. \n

*@see Switch() | Merge()


@@ -91,21 +91,21 @@ REG_OP(RefMerge)
.OP_END_FACTORY_REG(RefMerge)

/**
*@brief Forwards "data" to the output port determined by "pred".
* If "pred" is "true", the data input is forwarded to "output_true".
* Otherwise, the data is forwarded to "output_false". \n

*@par Inputs:
*@li data: The tensor to be forwarded.
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred: A boolean scalar. The output port that will receive data. \n

*@par Outputs:
*@li output_false: If "pred" is "false", data will be forwarded to this output.
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output.
* Has the same type as "data". \n

*@see Merge()


@@ -126,21 +126,21 @@ REG_OP(Switch)
.OP_END_FACTORY_REG(Switch)

/**
*@brief Forwards "data" to the output port determined by "pred".
* If "pred" is "true", the data input is forwarded to "output_true".
* Otherwise, the data is forwarded to "output_false". \n

*@par Inputs:
*@li data: The ref tensor to be forwarded.
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred: A boolean scalar. The output port that will receive data. \n

*@par Outputs:
*@li output_false: If "pred" is "false", data will be forwarded to this output.
* Has the same type as "data".
*@li output_true: If "pred" is "true", data will be forwarded to this output.
* Has the same type as "data". \n

*@see Merge() | Switch()


@@ -161,16 +161,16 @@ REG_OP(RefSwitch)
.OP_END_FACTORY_REG(RefSwitch)

/**
*@brief Forwards "data" to the output port determined by "pred_value". \n

*@par Inputs:
*@li data: The tensor to be forwarded.
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred_value: An int64 tensor which determines the output port that will receive data. \n

*@par Outputs:
*output: The output tensors, one of which will become available.
* Has the same type as "data".
*/
REG_OP(SwitchN)
@@ -184,24 +184,24 @@ REG_OP(SwitchN)
.OP_END_FACTORY_REG(SwitchN)

/**
*@brief Creates or finds a child frame, and makes "x" available to the child
* frame. This op is used together with Exit to create loops in the graph.
* The Executor uses the unique "frame_name" to identify frames.
* If "is_constant" is "true", output "y" is a constant in the child
* frame; otherwise it may be changed in the child frame. \n

*@par Inputs:
*x: The tensor to be made available to the child frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Attributes:
*@li frame_name: A required string. The name of the child frame.
*@li is_constant: A required bool. If true, the output is constant in
* the child frame. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x". \n

*@see Exit()


@@ -220,24 +220,24 @@ REG_OP(Enter)
.OP_END_FACTORY_REG(Enter)

/**
*@brief Creates or finds a child frame, and makes "x" available to the child
* frame. This op is used together with Exit to create loops in the graph.
* The Executor uses the unique "frame_name" to identify frames.
* If "is_constant" is "true", output "y" is a constant in the child
* frame; otherwise it may be changed in the child frame. \n

*@par Inputs:
*x: The tensor to be made available to the child frame.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Attributes:
*@li frame_name: A required string. The name of the child frame.
*@li is_constant: A required bool. If true, the output is constant in
* the child frame. \n

*@par Outputs:
*y: A tensor. Has the same type as "x". \n

*@see Exit() | Enter()


@@ -256,14 +256,14 @@ REG_OP(RefEnter)
.OP_END_FACTORY_REG(RefEnter)

/**
*@brief Forwards the input to the output. This op represents the loop
* termination condition. \n

*@par Inputs:
*x: A boolean scalar. The condition of the Switch op. \n

*@par Outputs:
*y: The tensor "x". \n

*@see Switch()


@@ -276,15 +276,15 @@ REG_OP(LoopCond)
.OP_END_FACTORY_REG(LoopCond)

/**
*@brief Makes the input available to the next iteration. \n

*@par Inputs:
*x: The tensor to be made available to the next iteration.
* Must be one of the following types: float16, float32, float64, int8,
* int16, int32, int64, uint8, uint16, uint32, uint64, bool. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x". \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator NextIteration.
@@ -299,15 +299,15 @@ REG_OP(NextIteration)
.OP_END_FACTORY_REG(NextIteration) .OP_END_FACTORY_REG(NextIteration)


/**
-*@brief Makes the input available to the next iteration.
+*@brief Makes the input available to the next iteration . \n

*@par Inputs:
-*x: The tensor to be made available to the next iteration. \n
-* Must be one of the following types: float16, float32, float64, int8, \n
-* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
+*x: The tensor to be made available to the next iteration.
+* Must be one of the following types: float16, float32, float64, int8,
+* int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n

*@par Outputs:
-*y: A tensor. Has the same type as "x".
+*y: A tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator RefNextIteration.
@@ -322,15 +322,15 @@ REG_OP(RefNextIteration)
.OP_END_FACTORY_REG(RefNextIteration)


/**
-*@brief Exits the current frame to its parent frame.
+*@brief Exits the current frame to its parent frame . \n

*@par Inputs:
-*x: The tensor to be made available to the parent frame. \n
-* Must be one of the following types: float16, float32, float64, int8, \n
-* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
+*x: The tensor to be made available to the parent frame.
+* Must be one of the following types: float16, float32, float64, int8,
+* int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n

*@par Outputs:
-*y: A Tensor. Has the same type as "x".
+*y: A Tensor. Has the same type as "x" . \n

*@see Enter()

@@ -347,15 +347,15 @@ REG_OP(Exit)
.OP_END_FACTORY_REG(Exit)


/**
-*@brief Exits the current frame to its parent frame.
+*@brief Exits the current frame to its parent frame . \n

*@par Inputs:
-*x: The tensor to be made available to the parent frame. \n
-* Must be one of the following types: float16, float32, float64, int8, \n
-* int16, int32, int64, uint8, uint16, uint32, uint64, bool.
+*x: The tensor to be made available to the parent frame.
+* Must be one of the following types: float16, float32, float64, int8,
+* int16, int32, int64, uint8, uint16, uint32, uint64, bool . \n

*@par Outputs:
-*y: A tensor. Has the same type as "x".
+*y: A tensor. Has the same type as "x" . \n

*@see Enter() | Exit()

@@ -372,9 +372,9 @@ REG_OP(RefExit)
.OP_END_FACTORY_REG(RefExit)


/**
-*@brief Only useful as a placeholder for control edges. \n
-* It is similar to a no-op that always produces a live control output \n
-* even when some control inputs are dead.
+*@brief Only useful as a placeholder for control edges.
+* It is similar to a no-op that always produces a live control output
+* even when some control inputs are dead . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator ControlTrigger.
@@ -389,7 +389,7 @@ REG_OP(ControlTrigger)
* Three inputs, including:
*@li x: One dimensional tensore of type int32, specifying queried shape, max size is 8.
*@li data_seq: One dimensional tensore of type int32, specifying the mapped table is queried.
-*@li level_index: One dimensional tensore of type int32, specifying secondary index.
+*@li level_index: One dimensional tensore of type int32, specifying secondary index. \n

*@par Outputs:
*@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map.


+31 -31  third_party/fwkacllib/inc/ops/ctc_ops.h

@@ -27,29 +27,29 @@
namespace ge {

/**
-*@brief Calculates the CTC Loss (log probability) for each batch entry. \n
-Also calculates the gradient.
+*@brief Calculates the CTC Loss (log probability) for each batch entry.
+Also calculates the gradient. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-*@li labels_indices: The indices of a `SparseTensor<int32, 2>`. \n
-`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for \n
+*@li labels_indices: The indices of a `SparseTensor<int32, 2>`.
+`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
`(batch b, time t)`.
*@li labels_values: The values (labels) associated with the given batch and time.
-*@li sequence_length: A vector containing sequence lengths (batch).
+*@li sequence_length: A vector containing sequence lengths (batch). \n

*@par Outputs:
*@li loss: A vector (batch) containing log-probabilities.
-*@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x \n
-batch_size x num_classes)`.
+*@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x
+batch_size x num_classes)`. \n

*@par Attributes:
-*@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to \n
+*@li preprocess_collapse_repeated: Scalar, if true then repeated labels are collapsed prior to
the CTC calculation.If not specified, defaults to false
-*@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation \n
-repeated non-blank labels will not be merged and are interpreted as \n
-individual labels. This is a simplified version of CTC. \n
-If not specified, defaults to true
+*@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation
+repeated non-blank labels will not be merged and are interpreted as
+individual labels. This is a simplified version of CTC.
+If not specified, defaults to true. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCLoss operator.
@@ -67,24 +67,24 @@ REG_OP(CTCLoss)
.OP_END_FACTORY_REG(CTCLoss)
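
To make the interface above concrete, here is a sketch of how CTCLoss would be registered in this header's REG_OP style. The dtype lists (DT_FLOAT/DT_DOUBLE for logits, DT_INT64/DT_INT32 for the sparse label tensors) and the Bool attribute spelling are assumptions, not a quote of the header:

// Sketch only: the CTCLoss interface as documented above.
REG_OP(CTCLoss)
    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))      // (max_time x batch_size x num_classes) logits
    .INPUT(labels_indices, TensorType({DT_INT64}))         // SparseTensor indices [b, t]
    .INPUT(labels_values, TensorType({DT_INT32}))          // labels per (batch, time)
    .INPUT(sequence_length, TensorType({DT_INT32}))        // per-batch sequence lengths
    .OUTPUT(loss, TensorType({DT_FLOAT, DT_DOUBLE}))       // per-batch log-probabilities
    .OUTPUT(gradient, TensorType({DT_FLOAT, DT_DOUBLE}))   // gradient of "loss", same shape as "inputs"
    .ATTR(preprocess_collapse_repeated, Bool, false)       // collapse repeated labels before CTC
    .ATTR(ctc_merge_repeated, Bool, true)                  // merge repeated non-blank labels
    .OP_END_FACTORY_REG(CTCLoss)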


/**
-*@brief Performs greedy decoding on the logits given in inputs.
+*@brief Performs greedy decoding on the logits given in inputs. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n

*@par Attributes:
-*@li merge_repeated: If True, merge repeated classes in output.
+*@li merge_repeated: If True, merge repeated classes in output. \n

*@par Outputs:
-*@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,\n
+*@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,
of a `SparseTensor<int64, 2>`. The rows store: [batch, time].
-*@li decoded_values: Values vector, size: `(total_decoded_outputs)`,\n
+*@li decoded_values: Values vector, size: `(total_decoded_outputs)`,
of a `SparseTensor<int64, 2>`. The vector stores the decoded classes.
-*@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.\n
+*@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.
Values are: `[batch_size, max_decoded_length]`.
-*@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence\n
-log-probabilities.
+*@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence
+log-probabilities. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCGreedyDecoder operator.
@@ -100,27 +100,27 @@ REG_OP(CTCGreedyDecoder)
.OP_END_FACTORY_REG(CTCGreedyDecoder)
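
A corresponding sketch for CTCGreedyDecoder, under the same assumptions (the dtypes and the default for merge_repeated are guesses, not quotes of the header):

// Sketch only: greedy CTC decoding, emitting a SparseTensor<int64, 2>.
REG_OP(CTCGreedyDecoder)
    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))       // (max_time x batch_size x num_classes) logits
    .INPUT(sequence_length, TensorType({DT_INT32}))         // per-batch sequence lengths
    .OUTPUT(decoded_indices, TensorType({DT_INT64}))        // (total_decoded_outputs x 2), rows are [batch, time]
    .OUTPUT(decoded_values, TensorType({DT_INT64}))         // decoded classes
    .OUTPUT(decoded_shape, TensorType({DT_INT64}))          // [batch_size, max_decoded_length]
    .OUTPUT(log_probability, TensorType({DT_FLOAT, DT_DOUBLE}))  // (batch_size x 1)
    .ATTR(merge_repeated, Bool, false)                      // default assumed
    .OP_END_FACTORY_REG(CTCGreedyDecoder)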


/**
-*@brief Performs beam search decoding on the logits given in input.
+*@brief Performs beam search decoding on the logits given in input. \n

*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n

*@par Attributes:
-*@li merge_repeated: If True, merge repeated classes in output.
+*@li merge_repeated: If True, merge repeated classes in output. \n

*@par Outputs:
-*@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,\n
-size `(total_decoded_outputs[j] x 2)`, has indices of a\n
+*@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,
+size `(total_decoded_outputs[j] x 2)`, has indices of a
`SparseTensor<int64, 2>`. The rows store: [batch, time].
-*@li decoded_values: A list (length: top_paths) of values vectors. Vector j,\n
-size `(length total_decoded_outputs[j])`, has the values of a\n
+*@li decoded_values: A list (length: top_paths) of values vectors. Vector j,
+size `(length total_decoded_outputs[j])`, has the values of a
`SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j.
-*@li decoded_shape: A list (length: top_paths) of shape vector. Vector j,\n
-size `(2)`, stores the shape of the decoded `SparseTensor[j]`.\n
+*@li decoded_shape: A list (length: top_paths) of shape vector. Vector j,
+size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
Its values are: `[batch_size, max_decoded_length[j]]`.
-*@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The\n
-sequence log-probabilities.
+*@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The
+sequence log-probabilities. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow CTCBeamSearchDecoder operator.


+553 -551  third_party/fwkacllib/inc/ops/data_flow_ops.h
(File diff suppressed because it is too large)


+360 -350  third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
(File diff suppressed because it is too large)


+85 -103  third_party/fwkacllib/inc/ops/functional_ops.h

@@ -25,40 +25,27 @@
#include "graph/operator.h"

namespace ge {
-REG_OP(SymbolicGradient)
-.DYNAMIC_INPUT(input, TensorType::ALL())
-.DYNAMIC_OUTPUT(output, TensorType::ALL())
-.GRAPH(f)
-.OP_END_FACTORY_REG(SymbolicGradient)
-
-REG_OP(RemoteCall)
-.INPUT(target, DT_STRING)
-.DYNAMIC_INPUT(args, TensorType::ALL())
-.DYNAMIC_OUTPUT(output, TensorType::ALL())
-.GRAPH(f)
-.OP_END_FACTORY_REG(RemoteCall)
-
/**
-*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
-* If "cond" means True, the selected subgraph is "then_branch". \n
-* Otherwise, the selected subgraph is "else_branch".
+*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
+* If "cond" means True, the selected subgraph is "then_branch".
+* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
-*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
-* if "cond" is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if "cond" is a numerical scalar, non-zero means True and zero means False;
+* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
-*@li input: The input tensors.
+*@li input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
+*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
-*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
-* whose types are the same as what then_branch returns.
+*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
+* whose types are the same as what then_branch returns . \n

*@par Outputs:
-*output: The output tensors returned by either then_branch(input) or else_branch(input).
+*output: The output tensors returned by either then_branch(input) or else_branch(input) . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator _If.
@@ -72,26 +59,26 @@ REG_OP(_If)
.OP_END_FACTORY_REG(_If)


/**
-*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
-* If "cond" means True, the selected subgraph is "then_branch". \n
-* Otherwise, the selected subgraph is "else_branch".
+*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
+* If "cond" means True, the selected subgraph is "then_branch".
+* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
-*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
-* if "cond" is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if "cond" is a numerical scalar, non-zero means True and zero means False;
+* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
-*@li input: The input tensors.
+*@li input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
+*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
-*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
-* whose types are the same as what then_branch returns.
+*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
+* whose types are the same as what then_branch returns . \n

*@par Outputs:
-*output: The output tensors returned by either then_branch(input) or else_branch(input).
+*output: The output tensors returned by either then_branch(input) or else_branch(input) . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatelessIf.
@@ -105,26 +92,26 @@ REG_OP(StatelessIf)
.OP_END_FACTORY_REG(StatelessIf)


/**
-*@brief Select one of the subgraphs to pass the input tensors and return the output tensors. \n
-* If "cond" means True, the selected subgraph is "then_branch". \n
-* Otherwise, the selected subgraph is "else_branch".
+*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
+* If "cond" means True, the selected subgraph is "then_branch".
+* Otherwise, the selected subgraph is "else_branch" . \n

*@par Inputs:
-*@li cond: A Tensor. If "cond" is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if "cond" is a numerical scalar, non-zero means True and zero means False; \n
-* if "cond" is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A Tensor. If "cond" is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if "cond" is a numerical scalar, non-zero means True and zero means False;
+* if "cond" is a string scalar, non-empty means True and empty means False;
* if "cond" is not a scalar, non-empty means True and empty means False.
-*@li input: The input tensors.
+*@li input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li then_branch: A subgraph takes 'input' and returns a list of tensors, \n
+*@li then_branch: A subgraph takes 'input' and returns a list of tensors,
* whose types are the same as what else_branch returns.
-*@li else_branch: A subgraph takes 'input' and returns a list of tensors, \n
-* whose types are the same as what then_branch returns.
+*@li else_branch: A subgraph takes 'input' and returns a list of tensors,
+* whose types are the same as what then_branch returns . \n

*@par Outputs:
-*output: The output tensors returned by either then_branch(input) or else_branch(input).
+*output: The output tensors returned by either then_branch(input) or else_branch(input) . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator If.
@@ -138,18 +125,18 @@ REG_OP(If)
.OP_END_FACTORY_REG(If)
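
Read together with the hunk header above, the documented If interface corresponds to a registration of roughly this shape. This is a sketch; the exact macro combination is inferred from the GRAPH/DYNAMIC_INPUT usage visible elsewhere in this diff:

// Sketch only: If routes "input" through then_branch or else_branch based on "cond".
REG_OP(If)
    .INPUT(cond, TensorType::ALL())             // converted to a boolean per the rule above
    .DYNAMIC_INPUT(input, TensorType::ALL())    // dynamic input, forwarded to the chosen branch
    .DYNAMIC_OUTPUT(output, TensorType::ALL())  // whatever the chosen branch returns
    .GRAPH(then_branch)
    .GRAPH(else_branch)
    .OP_END_FACTORY_REG(If)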


/**
-*@brief Select one of the subgraphs to pass the input tensors and return the output tensors.
+*@brief Select one of the subgraphs to pass the input tensors and return the output tensors . \n

*@par Inputs:
*@li branch_index: A int32 scalar which determines the selected subgraph.
-*@li input: The input tensors, which will be passed to the subgraph.
+*@li input: The input tensors, which will be passed to the subgraph . It's a dynamic input. \n

*@par Graphs:
-*branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors, \n
-* whose types are the same as what every other subgraph returns.
+*branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors,
+* whose types are the same as what every other subgraph returns . \n

*@par Outputs:
-*output: The output tensors returned by one of branches.
+*output: The output tensors returned by one of branches . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator Case.
@@ -162,25 +149,25 @@ REG_OP(Case)
.OP_END_FACTORY_REG(Case)


/**
-*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
+*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
-*input: The input tensors.
+*input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li cond: A subgraph takes 'input' and returns a tensor. \n
-* If the tensor is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if it is a numerical scalar, non-zero means True and zero means False; \n
-* if it is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A subgraph takes 'input' and returns a tensor.
+* If the tensor is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if it is a numerical scalar, non-zero means True and zero means False;
+* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
-*@li body: A subgraph takes 'input' and returns a another list of tensors.
+*@li body: A subgraph takes 'input' and returns a another list of tensors . \n

*@par Attributes:
-*parallel_iterations: An optional int, default as 10.
+*parallel_iterations: An optional int, default as 10 . \n

*@par Outputs:
-*output: The output tensors returned by "body". Has the same type as "input".
+*output: The output tensors returned by "body". Has the same type as "input" . \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator _While.
@@ -193,25 +180,25 @@ REG_OP(_While)
.OP_END_FACTORY_REG(_While)


/**
-*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
+*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
-*input: The input tensors.
+*input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li cond: A subgraph takes 'input' and returns a tensor. \n
-* If the tensor is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if it is a numerical scalar, non-zero means True and zero means False; \n
-* if it is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A subgraph takes 'input' and returns a tensor.
+* If the tensor is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if it is a numerical scalar, non-zero means True and zero means False;
+* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
-*@li body: A subgraph takes 'input' and returns a another list of tensors.
+*@li body: A subgraph takes 'input' and returns a another list of tensors . \n

*@par Attributes:
-*parallel_iterations: An optional int, default as 10.
+*parallel_iterations: An optional int, default as 10 . \n

*@par Outputs:
-*output: The output tensors returned by "body". Has the same type as "input".
+*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator While.
@@ -225,25 +212,25 @@ REG_OP(While)
.OP_END_FACTORY_REG(While)
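
The While family documented here pairs a "cond" graph with a "body" graph. A registration sketch under the same assumptions as the earlier ones (the Int attribute spelling is assumed):

// Sketch only: While re-runs "body" until "cond" evaluates to False.
REG_OP(While)
    .DYNAMIC_INPUT(input, TensorType::ALL())    // loop-carried tensors (dynamic input)
    .DYNAMIC_OUTPUT(output, TensorType::ALL())  // final loop-carried tensors (dynamic output)
    .GRAPH(cond)                                // returns the continuation condition
    .GRAPH(body)                                // produces the next iteration's tensors
    .ATTR(parallel_iterations, Int, 10)         // optional, default 10
    .OP_END_FACTORY_REG(While)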


/**
-*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False.
+*@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n

*@par Inputs:
-*input: The input tensors.
+*input: The input tensors . It's a dynamic input. \n

*@par Graphs:
-*@li cond: A subgraph takes 'input' and returns a tensor. \n
-* If the tensor is not a scalar of boolean type, \n
-* it will be converted to a boolean according to the following rule: \n
-* if it is a numerical scalar, non-zero means True and zero means False; \n
-* if it is a string scalar, non-empty means True and empty means False; \n
+*@li cond: A subgraph takes 'input' and returns a tensor.
+* If the tensor is not a scalar of boolean type,
+* it will be converted to a boolean according to the following rule:
+* if it is a numerical scalar, non-zero means True and zero means False;
+* if it is a string scalar, non-empty means True and empty means False;
* if it is not a scalar, non-empty means True and empty means False.
-*@li body: A subgraph takes 'input' and returns a another list of tensors.
+*@li body: A subgraph takes 'input' and returns a another list of tensors . \n

*@par Attributes:
-*parallel_iterations: An optional int, default as 10.
+*parallel_iterations: An optional int, default as 10 . \n

*@par Outputs:
-*output: The output tensors returned by "body". Has the same type as "input".
+*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatelessWhile.
@@ -257,19 +244,19 @@ REG_OP(StatelessWhile)
.OP_END_FACTORY_REG(StatelessWhile)


/**
-*@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound.
+*@brief Cyclic execute the "body" subgraph until the first input of For op exceed upper bound . \n

*@par Inputs:
*@li start: A int32 scalar. The lower bound.
*@li limit: A int32 scalar. The upper bound.
*@li delta: A int32 scalar. The step size.
-*@li input: The input tensors, which will be passed to "body".
+*@li input: The input tensors, which will be passed to "body" . It's a dynamic input. \n

*@par Graphs:
-*body: A subgraph takes 'input' and returns a another list of tensors.
+*body: A subgraph takes 'input' and returns a another list of tensors . \n

*@par Outputs:
-*output: The output tensors returned by "body". Has the same type as "input".
+*output: The output tensors returned by "body". Has the same type as "input" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator For.
@@ -284,21 +271,21 @@ REG_OP(For)
.OP_END_FACTORY_REG(For)
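
For completeness, a sketch of the For interface; the DT_INT32 scalars follow the doc above, and everything else is the same assumed pattern:

// Sketch only: For runs "body" while the counter stays below "limit".
REG_OP(For)
    .INPUT(start, TensorType({DT_INT32}))       // lower bound
    .INPUT(limit, TensorType({DT_INT32}))       // upper bound
    .INPUT(delta, TensorType({DT_INT32}))       // step size
    .DYNAMIC_INPUT(input, TensorType::ALL())    // tensors passed to "body" (dynamic input)
    .DYNAMIC_OUTPUT(output, TensorType::ALL())  // tensors returned by "body" (dynamic output)
    .GRAPH(body)
    .OP_END_FACTORY_REG(For)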


/**
-*@brief Pass the input tensors to the subgraph "f" and return the output tensors.
+*@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n

*@par Inputs:
-*args: The input tensors, which will be passed to "f".
+*args: The input tensors, which will be passed to "f" . It's a dynamic input. \n

*@par Graphs:
-*f: A subgraph takes 'args' and returns a another list of tensors.
+*f: A subgraph takes 'args' and returns a another list of tensors . \n

*@par Attributes:
*@li config: An optional string, default as "".
*@li config_proto: An optional int, default as "".
-*@li executor_type: An optional int, default as "".
+*@li executor_type: An optional int, default as "" . \n

*@par Outputs:
-*output: The output tensors returned by "f".
+*output: The output tensors returned by "f" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator PartitionedCall.
@@ -313,21 +300,21 @@ REG_OP(PartitionedCall)
.OP_END_FACTORY_REG(PartitionedCall)
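
The StatefulPartitionedCall hunk below shows `.ATTR(executor_type, String, "")` verbatim, which suggests the call attributes are all strings even though the doc above calls config_proto and executor_type "an optional int". A sketch on that assumption:

// Sketch only: PartitionedCall forwards "args" through subgraph "f".
REG_OP(PartitionedCall)
    .DYNAMIC_INPUT(args, TensorType::ALL())     // dynamic input, passed to "f"
    .DYNAMIC_OUTPUT(output, TensorType::ALL())  // dynamic output, returned by "f"
    .GRAPH(f)
    .ATTR(config, String, "")
    .ATTR(config_proto, String, "")             // String assumed; the doc's "optional int" looks like a typo
    .ATTR(executor_type, String, "")
    .OP_END_FACTORY_REG(PartitionedCall)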


/**
-*@brief Pass the input tensors to the subgraph "f" and return the output tensors.
+*@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n

*@par Inputs:
-*args: The input tensors, which will be passed to "f".
+*args: The input tensors, which will be passed to "f" . It's a dynamic input. \n

*@par Graphs:
-*f: A subgraph takes 'args' and returns a another list of tensors.
+*f: A subgraph takes 'args' and returns a another list of tensors . \n

*@par Attributes:
*@li config: An optional string, default as "".
*@li config_proto: An optional int, default as "".
-*@li executor_type: An optional int, default as "".
+*@li executor_type: An optional int, default as "" . \n

*@par Outputs:
-*output: The output tensors returned by "f".
+*output: The output tensors returned by "f" . It's a dynamic output. \n

*@par Third-party framework compatibility
*@Compatible with the TensorFlow operator StatefulPartitionedCall.
@@ -341,11 +328,6 @@ REG_OP(StatefulPartitionedCall)
.ATTR(executor_type, String, "")
.OP_END_FACTORY_REG(StatefulPartitionedCall)

-REG_OP(FakeParam)
-.OUTPUT(output, TensorType::ALL())
-.ATTR(shape, ListInt, {})
-.OP_END_FACTORY_REG(FakeParam)

} // namespace ge

#endif // GE_FUNCTIONAL_OPS_H_

+64 -63  third_party/fwkacllib/inc/ops/hcom_ops.h

@@ -27,18 +27,18 @@ namespace ge {
/**
* @brief Outputs a tensor gathering all input tensors.
* @par Inputs:
-* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
-* float32.
+* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
+float32.
* @par Attributes:
-* @li rank_size: A required integer identifying the number of ranks
-* participating in the op.
-* @li group: A required string identifying the group name of ranks
-* participating in the op.
+* @li rank_size: A required integer identifying the number of ranks
+participating in the op.
+* @li group: A required string identifying the group name of ranks
+participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
-* @attention Constraints:\n
-* "group" is limited to 128 characters. Use "hccl_world_group"
-* as the name of a world group.
+* @attention Constraints:
+"group" is limited to 128 characters. Use "hccl_world_group"
+as the name of a world group.
*/
REG_OP(HcomAllGather)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -50,25 +50,25 @@ REG_OP(HcomAllGather)
.OP_END_FACTORY_REG(HcomAllGather)

/**
-* @brief Outputs a tensor containing the reduction across all input tensors
-* passed to op.
+* @brief Outputs a tensor containing the reduction across all input tensors
+passed to op.
* @par Inputs:
-* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
-* float32.
+* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
+float32.
* @par Attributes:
-* @li reduction: A required string identifying the reduction operation to
-* perform.The supported operation are: "sum", "max", "min", "prod".
-* @li group: A required string identifying the group name of ranks
-* participating in the op.
-* @li fusion: An optional integer identifying the fusion flag of the op. \n
-* 0: no fusion; 1 (default): fusion; 2: fusion the ops by fusion id.
+* @li reduction: A required string identifying the reduction operation to
+perform.The supported operation are: "sum", "max", "min", "prod".
+* @li group: A required string identifying the group name of ranks
+participating in the op.
+* @li fusion: An optional integer identifying the fusion flag of the op.
+0: no fusion; 1 (default): fusion; 2: fusion the ops by fusion id.
* @li fusion_id: An optional integer identifying the fusion id of the op.
* The HcomAllReduce ops with the same fusion id will be fused.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
-* @attention Constraints: \n
-* "group" is limited to 128 characters. Use "hccl_world_group"
-* as the name of a world group.
+* @attention Constraints:
+*"group" is limited to 128 characters. Use "hccl_world_group"
+as the name of a world group.
*/
REG_OP(HcomAllReduce)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -84,18 +84,19 @@ REG_OP(HcomAllReduce)
/**
* @brief Broadcasts the input tensor in root rank to all ranks.
* @par Inputs:
-* x: A list of dynamic input tensor. Must be one of the following types:
-* int8, int16, int32, float16, float32.
+* x: A list of dynamic input tensor. Must be one of the following types:
+int8, int16, int32, float16, float32. It's a dynamic input.
* @par Attributes:
-* @li root_rank: A required integer identifying the root rank in the op
-* input of this rank will be broadcast to other ranks.
-* @li group: A required string identifying the group name of ranks
-* participating in the op.
+* @li root_rank: A required integer identifying the root rank in the op
+input of this rank will be broadcast to other ranks.
+* @li group: A required string identifying the group name of ranks
+participating in the op.
* @par Outputs:
* y: A list of dynamic output tensor. Has the same type and length as "x".
-* @attention Constraints:\n
-* "group" is limited to 128 characters. Use "hccl_world_group"
-* as the name of a world group.
+* It's a dynamic output.
+* @attention Constraints:
+"group" is limited to 128 characters. Use "hccl_world_group"
+as the name of a world group.
*/
REG_OP(HcomBroadcast)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -107,24 +108,24 @@ REG_OP(HcomBroadcast)
.OP_END_FACTORY_REG(HcomBroadcast)

/**
-* @brief Performs reduction across all input tensors, scattering in equal
-* blocks among ranks, each rank getting a chunk of data based on its rank
-* index.
+* @brief Performs reduction across all input tensors, scattering in equal
+blocks among ranks, each rank getting a chunk of data based on its rank
+index.
* @par Inputs:
-* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
-* float32.
+* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
+float32.
* @par Attributes:
-* @li reduction: A required string identifying the reduction operation to
-* perform. The supported operation are: "sum", "max", "min", "prod".
-* @li group: A required string identifying the group name of ranks
-* participating in the op.
-* @li rank_size: A required integer identifying the number of ranks
-* participating in the op.
+* @li reduction: A required string identifying the reduction operation to
+perform. The supported operation are: "sum", "max", "min", "prod".
+* @li group: A required string identifying the group name of ranks
+participating in the op.
+* @li rank_size: A required integer identifying the number of ranks
+participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
-* @attention Constraints:\n
-* "group" is limited to 128 characters. Use "hccl_world_group"
-* as the name of a world group.
+* @attention Constraints:
+"group" is limited to 128 characters. Use "hccl_world_group"
+as the name of a world group.
*/
REG_OP(HcomReduceScatter)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
@@ -139,19 +140,19 @@ REG_OP(HcomReduceScatter)
/**
* @brief Sends the input tensor to destination rank.
* @par Inputs:
-* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
-* float32.
+* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
+float32.
* @par Attributes:
-* @li sr_tag: A required integer identifying the send/recv message tag. The
-* message will be received by the HcomReceive op with the same "sr_tag".
+* @li sr_tag: A required integer identifying the send/recv message tag. The
+message will be received by the HcomReceive op with the same "sr_tag".
* @li dest_rank: A required integer identifying the destination rank.
-* @li group: A string identifying the group name of ranks participating in
-* the op.
+* @li group: A string identifying the group name of ranks participating in
+the op.
* @par Outputs:
* None.
-* @attention Constraints:\n
-* @li "group" is limited to 128 characters. Use
-* "hccl_world_group" as the name of a world group.
+* @attention Constraints:
+@li "group" is limited to 128 characters. Use
+"hccl_world_group" as the name of a world group.
* @li Operators HcomSend and HcomReceive have the same "sr_tag".
* @see HcomReceive
*/
@@ -169,20 +170,20 @@ REG_OP(HcomSend)
* @par Inputs:
* None.
* @par Attributes:
-* @li sr_tag: A required integer identifying the send/recv message tag. The
-* message will be send by the HcomSend op with the same "sr_tag".
+* @li sr_tag: A required integer identifying the send/recv message tag. The
+message will be send by the HcomSend op with the same "sr_tag".
* @li src_rank: A required integer identifying the source rank.
* @li group: A required string identifying the group name of ranks
* participating in the op.
-* @li shape: A required list identifying the shape of the tensor to be
-* received.
-* @li dtype: A required integer identifying the type of the tensor to be
-* received. The supported types are: int8, int16, int32, float16, float32.
+* @li shape: A required list identifying the shape of the tensor to be
+received.
+* @li dtype: A required integer identifying the type of the tensor to be
+received. The supported types are: int8, int16, int32, float16, float32.
* @par Outputs:
* y: A tensor with type identified in "dtype".
-* @attention Constraints:\n
-* @li "group" is limited to 128 characters. Use
-* "hccl_world_group" as the name of a world group.
+* @attention Constraints:
+@li "group" is limited to 128 characters. Use
+"hccl_world_group" as the name of a world group.
* @li Operators HcomSend and HcomReceive have the same "sr_tag".
* @li "shape" should be same as the input tensor of HcomSend.
* @li "dtype" should be same as the input tensor of HcomSend.


+13 -13  third_party/fwkacllib/inc/ops/hvd_ops.h

@@ -28,10 +28,10 @@ namespace ge {
* @brief Outputs a tensor gathering all input tensors.
* @par Inputs:
* x: A tensor. Must be one of the following types: uint8, int8, uint16, int16, int32,
-* int64, float16, bool.
+int64, float16, bool.
* @par Attributes:
-* @li rank_size: A required integer identifying the number of ranks
-* participating in the op.
+* @li rank_size: A required integer identifying the number of ranks
+participating in the op.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
*/
@@ -44,13 +44,13 @@ REG_OP(HorovodAllgather)
.OP_END_FACTORY_REG(HorovodAllgather)

/**
-* @brief Outputs a tensor containing the reduction across all input tensors
-* passed to op.
+* @brief Outputs a tensor containing the reduction across all input tensors
+passed to op.
* @par Inputs:
-* x: A tensor. Must be one of the following types: int32, int64, float16, float32
-* @par Attributes:
-* @li reduce_op: A required int identifying the reduction operation to
-* perform.The supported operation are: "sum", "max", "min", "prod".
+* x: A tensor. Must be one of the following types: int32, int64, float16, float32
+@par Attributes:
+* @li reduce_op: A required int identifying the reduction operation to
+perform.The supported operation are: "sum", "max", "min", "prod".
* @par Outputs:
* y: A Tensor. Has the same type as "x".
*/
@@ -63,11 +63,11 @@ REG_OP(HorovodAllreduce)
/**
* @brief Broadcasts the input tensor in root rank to all ranks.
* @par Inputs:
-* x: A list of dynamic input tensor. Must be one of the following types:
-* int8, int32, float16, float32.
+* x: A list of dynamic input tensor. Must be one of the following types:
+int8, int32, float16, float32.
* @par Attributes:
-* @li root_rank: A required integer identifying the root rank in the op
-* input of this rank will be broadcast to other ranks.
+* @li root_rank: A required integer identifying the root rank in the op
+input of this rank will be broadcast to other ranks.
* @par Outputs:
* y: A list of dynamic output tensor. Has the same type and length as "x".
*/


+396 -377  third_party/fwkacllib/inc/ops/image_ops.h
(File diff suppressed because it is too large)


Some files were not shown because too many files changed in this diff
