@@ -17,6 +17,10 @@ else()
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
endif()
if (ENABLE_PYTHON)
    add_compile_definitions(ENABLE_PYTHON)
endif()
set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb -fno-inline-functions -fno-omit-frame-pointer -Wl,--allow-shlib-undefined -D_LIBCPP_INLINE_VISIBILITY='' -D'_LIBCPP_EXTERN_TEMPLATE(...)=' -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2 -Wno-cpp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I/usr/local/include -std=c++17 -Werror -Wall -Wno-deprecated-declarations -fPIC")
@@ -70,6 +70,22 @@ Alexey Shevlyakov, avakh, baihuawei, BowenK, buxue, caifubi, caojian05, Cathy Wo
Contributions of any kind are welcome!

# Release 0.3.1-alpha

## Major Features and Improvements

### Ascend 910 Training and Inference Framework
* Frontend and User Interface
    * Independent model init interface.
* Data processing, augmentation, and save format
    * Support sample padding for minddataset.

## Bugfixes
* Python API
    * Fix bugs in the lars optimizer ([!1894](https://gitee.com/mindspore/mindspore/pulls/1894))
* Data processing
    * Fix accuracy problem of RandomCropDecodeResize ([!2340](https://gitee.com/mindspore/mindspore/pulls/2340))

# Release 0.3.0-alpha

## Major Features and Improvements
@@ -24,8 +24,8 @@ usage()
{
    echo "Usage:"
    echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
    echo "              [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
    echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E]"
    echo "              [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
    echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
    echo ""
    echo "Options:"
    echo "    -d Debug mode"
@@ -48,6 +48,7 @@ usage()
    echo "    -P Enable dump anf graph to file in ProtoBuffer format, default on"
    echo "    -Q Enable dump memory, default off"
    echo "    -D Enable dumping of function graph ir, default on"
    echo "    -S Enable async data dump, default off"
    echo "    -z Compile dataset & mindrecord, default on"
    echo "    -M Enable MPI and NCCL for GPU training, gpu default on"
    echo "    -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -56,6 +57,7 @@ usage()
    echo "    -s Enable serving module, default off"
    echo "    -B Enable debugger, default off"
    echo "    -E Enable IBVERBS for parameter server, default off"
    echo "    -l Compile with python dependency, default on"
}
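# For example, an Ascend backend build with the new async data dump enabled
# and the python dependency disabled could be invoked as follows (an
# illustrative combination; every other option keeps the defaults
# documented above):
#
#   bash build.sh -e d -S on -l off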
# check value of input is 'on' or 'off'
@@ -87,6 +89,7 @@ checkopts()
    ENABLE_TIMELINE="off"
    ENABLE_DUMP2PROTO="on"
    ENABLE_DUMPE2E="off"
    ENABLE_DATA_DUMP="off"
    ENABLE_DUMP_IR="on"
    COMPILE_MINDDATA="on"
    ENABLE_MPI="off"
@@ -98,9 +101,10 @@ checkopts()
    ENABLE_SERVING="off"
    ENABLE_DEBUGGER="off"
    ENABLE_IBVERBS="off"
    ENABLE_PYTHON="on"
    # Process the options
    while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:sB:E' opt
    while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
    do
        OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
        case "${opt}" in
@@ -151,6 +155,10 @@ checkopts()
            check_on_off $OPTARG p
            ENABLE_PROFILE="$OPTARG"
            ;;
        l)
            check_on_off $OPTARG l
            ENABLE_PYTHON="$OPTARG"
            ;;
        i)
            INC_BUILD="on"
            ;;
@@ -212,6 +220,11 @@ checkopts()
            ENABLE_DUMPE2E="$OPTARG"
            echo "enable dump end to end"
            ;;
        S)
            check_on_off $OPTARG S
            ENABLE_DATA_DUMP="$OPTARG"
            echo "enable data dump"
            ;;
        D)
            check_on_off $OPTARG D
            ENABLE_DUMP_IR="$OPTARG"
@@ -315,7 +328,11 @@ build_mindspore()
    if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
    fi
    if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
    fi
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
    if [[ "X$ENABLE_MPI" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_MPI=ON"
    fi
@@ -9,11 +9,11 @@ else()
        LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
        URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
        MD5 0c2662a2b0bc80b0eb56495205247c8f
        CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-rpath --disable-tests --disable-samples --disable-icuio --disable-extras ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
        CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh
    )
    include_directories(${icu4c_INC})
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    add_definitions(-D ENABLE_ICU4C)
    endif()
endif()
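The configure logic now lives in scripts/build_icu4c.sh, whose contents are not part of this hunk. A minimal sketch of such a wrapper, assuming it only re-packages the command that was previously inlined above (the filter-file path is illustrative):

#!/bin/bash
# Hypothetical wrapper — the real scripts/build_icu4c.sh is not shown in this diff.
set -e
./icu4c/source/runConfigureICU Linux --enable-rpath --disable-tests --disable-samples \
    --disable-icuio --disable-extras \
    ICU_DATA_FILTER_FILE="${ICU_DATA_FILTER_FILE:-third_party/icu4c/filter.json}"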
@@ -15,7 +15,7 @@ include(${CMAKE_SOURCE_DIR}/cmake/external_libs/json.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/dependency_securec.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
if (ENABLE_DEBUGGER)
if (ENABLE_DEBUGGER OR ENABLE_SERVING)
    # build dependencies of gRPC
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/absl.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/c-ares.cmake)
@@ -30,7 +30,7 @@ include(${CMAKE_SOURCE_DIR}/cmake/external_libs/flatbuffers.cmake)
if(USE_GLOG)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/glog.cmake)
endif()
if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND NOT ENABLE_GE)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/zeromq.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/pslite.cmake)
endif()
@@ -19,6 +19,7 @@ option(ENABLE_MPI "enable mpi" OFF)
option(ENABLE_AKG "enable akg" OFF)
option(ENABLE_DEBUGGER "enable debugger" OFF)
option(ENABLE_IBVERBS "enable IBVERBS for parameter server" OFF)
option(ENABLE_PYTHON "Enable python" ON)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    if (WIN32)
@@ -115,6 +116,10 @@ if(ENABLE_DUMP_E2E)
    add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
    add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
    add_compile_definitions(ENABLE_DEBUGGER)
endif()
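With the new compile definition in place, source files can conditionally compile the data-dump path. A minimal hypothetical guard (DumpKernelOutput and its arguments are illustrative placeholders, not an API from this diff):

#ifdef ENABLE_DATA_DUMP
// Only compiled when cmake is configured with -DENABLE_DATA_DUMP=ON
// (i.e. build.sh -S on). DumpKernelOutput is a placeholder name.
DumpKernelOutput(kernel_name, output_addr);
#endif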
@@ -213,7 +213,6 @@ install(
        ${CMAKE_SOURCE_DIR}/mindspore/parallel
        ${CMAKE_SOURCE_DIR}/mindspore/mindrecord
        ${CMAKE_SOURCE_DIR}/mindspore/train
        ${CMAKE_SOURCE_DIR}/mindspore/model_zoo
        ${CMAKE_SOURCE_DIR}/mindspore/common
        ${CMAKE_SOURCE_DIR}/mindspore/ops
        ${CMAKE_SOURCE_DIR}/mindspore/communication
@@ -261,3 +260,17 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
        COMPONENT mindspore
    )
endif ()
if (ENABLE_SERVING)
    install(
        TARGETS ms_serving
        DESTINATION ${INSTALL_BASE_DIR}
        COMPONENT mindspore
    )
    install(
        TARGETS inference
        DESTINATION ${INSTALL_LIB_DIR}
        COMPONENT mindspore
    )
endif ()
@@ -0,0 +1,15 @@
{
    "DumpSettings": {
        "net_name": "ResNet50",
        "mode": 1,
        "iteration": 0,
        "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
    },
    "DumpSettingsSpec": {
        "net_name": "net name, e.g. ResNet50",
        "mode": "0: dump all kernels, 1: dump kernels in kernels list",
        "iteration": "specified iteration",
        "kernels": "op's full scope name which needs to be dumped"
    }
}
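Per the DumpSettingsSpec above, setting "mode" to 0 dumps every kernel and the "kernels" list is then ignored; for example:

{
    "DumpSettings": {
        "net_name": "ResNet50",
        "mode": 0,
        "iteration": 0,
        "kernels": []
    }
}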
@@ -1 +1 @@
Subproject commit 4084909d62c159da6ba316f61ad3d02a4857b34b
Subproject commit 31aa96ef41067a0ecdc4113ef245f8ede48f3457
@@ -20,7 +20,7 @@
#include <utility>
#include <vector>
#include <memory>
#include "ir/dtype/type_id.h"
#include "mindspore/core/ir/dtype/type_id.h"
namespace mindspore {
#define MS_API __attribute__((visibility("default")))
@@ -334,7 +334,7 @@ class Parser:
    def __init__(self, fn: (types.FunctionType, types.MethodType), parse_method=None) -> None:
        self.fn = fn
        self.parse_method = parse_method
        _, self.line_offset = inspect.getsourcelines(self.fn)
        self.line_offset = 0
        self.filename: str = inspect.getfile(self.fn)
        # Used to resolve the function's globals Namespace.
@@ -350,7 +350,8 @@ class Parser:
        logger.debug("fn = %r", self.fn)
        tree = None
        if isinstance(self.fn, (types.FunctionType, types.MethodType)):
            original_src = inspect.getsource(self.fn)
            lines, self.line_offset = inspect.getsourcelines(self.fn)
            original_src = ''.join(lines)
            hexstr = hashlib.sha256(original_src.encode()).hexdigest()
            tree = Parser.ast_cache.get(hexstr)
            if not tree:
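The key point of this fix is that inspect.getsourcelines() returns both the source lines and the 1-based starting line number, so reading them together in parse keeps line_offset consistent with the exact source that is hashed and parsed. A standalone illustration of the stdlib behaviour relied on:

import inspect

def f():
    return 1

lines, offset = inspect.getsourcelines(f)  # (list of source lines, starting line number)
assert ''.join(lines) == inspect.getsource(f)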
@@ -108,7 +108,8 @@ def enumerate_(x, start=0):
    """Enumerate list or tuple."""
    x_type = F.typeof(x)
    ret = ()
    if check_is_tuple_or_list(x_type, "enumerate"):
    op_name = "enumerate"
    if check_is_tuple_or_list(x_type, op_name, "first input") and check_is_const_int(start, op_name, "start"):
        ret = zip(range(start, start + len(x)), x)
    return ret
@@ -123,11 +124,22 @@ def while_cond(x):
@constexpr
def check_is_tuple_or_list(x, op_name):
def check_is_tuple_or_list(x, op_name, arg_name):
    """Check whether x is a list or a tuple."""
    if isinstance(x, (mstype.list_type, mstype.tuple_type)):
        return True
    raise TypeError(f"For '{op_name}', the input parameter should be tuple or list, but got {x}.")
    raise TypeError(f"For '{op_name}', the '{arg_name}' should be tuple or list, but got {x}.")
@constexpr
def check_is_const_int(x, op_name, arg_name):
    """Check whether x is a const int."""
    if x is None:
        raise TypeError(f"For '{op_name}', the '{arg_name}' should be a const int number, but got a non-constant value.")
    if not isinstance(x, int):
        raise TypeError(f"For '{op_name}', the '{arg_name}' should be a const int number, but got {x}.")
    return True
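# With a constant integer `start`, enumerate_ reduces to zipping an index range
# with the input; a plain-Python equivalent of the behaviour above:
#
#   x = ("a", "b", "c")
#   start = 1
#   tuple(zip(range(start, start + len(x)), x))  # ((1, 'a'), (2, 'b'), (3, 'c'))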
@constexpr
def check_is_tensor_bool_cond(shp):
@@ -1,4 +1,5 @@
## common setting
include_directories(${CMAKE_SOURCE_DIR}/mindspore/core)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CMAKE_BINARY_DIR})
link_directories(${CMAKE_SOURCE_DIR}/build/mindspore/graphengine)
@@ -35,20 +36,20 @@ if(ENABLE_GPU)
    include_directories(${CUDNN_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS})
    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "device/gpu/*.cc"
        "device/gpu/*.cu"
        "kernel/gpu/*.cu"
        "kernel/akg/gpu/*.cc"
        "kernel/akg/akg_kernel_build.cc"
        "kernel/akg/akg_kernel_attrs_process.cc"
        "runtime/device/gpu/*.cc"
        "runtime/device/gpu/*.cu"
        "backend/kernel_compiler/gpu/*.cu"
        "backend/kernel_compiler/akg/gpu/*.cc"
        "backend/kernel_compiler/akg/akg_kernel_build.cc"
        "backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
    )
    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
    list(REMOVE_ITEM GPU_SRC_LIST "device/gpu/blocking_queue.cc" "device/gpu/gpu_buffer_mgr.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "device/gpu/mpi/mpi_initializer.cc"
        "device/gpu/distribution/collective_wrapper.cc"
        "device/gpu/distribution/mpi_wrapper.cc"
        "device/gpu/distribution/nccl_wrapper.cc"
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
        "runtime/device/gpu/distribution/collective_wrapper.cc"
        "runtime/device/gpu/distribution/mpi_wrapper.cc"
        "runtime/device/gpu/distribution/nccl_wrapper.cc"
    )
    set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
@@ -56,6 +57,7 @@ if(ENABLE_GPU)
    set_property(SOURCE ${GPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
    cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
    add_compile_definitions(ENABLE_GPU)
endif ()
## make flatbuffer files
@@ -101,16 +103,20 @@ if (ENABLE_DUMP_PROTO)
endif ()
if (ENABLE_D)
    include_directories("${CMAKE_BINARY_DIR}/kernel/aicpu")
    include_directories("${CMAKE_BINARY_DIR}/backend/kernel_compiler/aicpu")
    include_directories("${CMAKE_BINARY_DIR}/predict/generator/ir")
    file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel/aicpu/proto/*.proto")
    file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "backend/kernel_compiler/aicpu/proto/*.proto")
    ms_protobuf_generate(PROTOSRCS PROTOHDRS ${PROTO_IN})
    file(GLOB_RECURSE PROTO_INNER RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "predict/proto/*.proto")
    ms_protobuf_generate(PREDICT_PROTOSRCS PREDICT_PROTOHDRS ${PROTO_INNER})
    file(GLOB_RECURSE PROTO_DUMP RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "runtime/device/ascend/dump/proto/*.proto")
    ms_protobuf_generate(DUMP_PROTOSRCS PROTOHDRS ${PROTO_DUMP})
    list(APPEND MINDSPORE_PROTO_LIST ${PROTOSRCS})
    list(APPEND MINDSPORE_PROTO_LIST ${PREDICT_PROTOSRCS})
    list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS})
    add_compile_definitions(ENABLE_D)
endif ()
@@ -121,18 +127,36 @@ if (MINDSPORE_PROTO_LIST)
endif()
## make sub objects
set(SUB_COMP
    transform pre_activate parallel pipeline device kernel common debug gvar ir onnx operator optimizer predict
    pybind_api pynative session utils vm
set(SUB_COMP
    transform/graph_ir
    transform/onnx
    backend/optimizer
    backend/kernel_compiler
    backend/session
    runtime/device
    frontend/optimizer
    frontend/parallel
    frontend/operator
    pipeline/jit
    pipeline/pynative
    common debug gvar predict pybind_api utils vm
)
foreach (_comp ${SUB_COMP})
    add_subdirectory(${_comp})
    if (TARGET _mindspore_${_comp}_obj)
        list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_${_comp}_obj>)
        add_dependencies(_mindspore_${_comp}_obj proto_input flat_input)
    string(REPLACE "/" "_" sub ${_comp})
    if (TARGET _mindspore_${sub}_obj)
        list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_${sub}_obj>)
        add_dependencies(_mindspore_${sub}_obj proto_input flat_input)
    endif ()
endforeach ()
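# The string(REPLACE "/" "_" ...) step above maps each component path to its
# object-library target name, for example:
#   transform/graph_ir      -> _mindspore_transform_graph_ir_obj
#   backend/kernel_compiler -> _mindspore_backend_kernel_compiler_obj
#   common                  -> _mindspore_common_obj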
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/base base)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_base_obj>)
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/abstract abstract)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_abstract_obj>)
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/ir ir)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_ir_obj>)
add_dependencies(_mindspore_base_obj _mindspore_ir_obj _mindspore_abstract_obj proto_input flat_input)
set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
add_library(mindspore STATIC ${SUB_OBJECTS_SRC})
@@ -204,8 +228,8 @@ endif()
# set c_expression building
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set_property(SOURCE "pipeline/init.cc" PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
pybind11_add_module(_c_expression "pipeline/init.cc")
set_property(SOURCE "pipeline/jit/init.cc" PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
pybind11_add_module(_c_expression "pipeline/jit/init.cc")
MESSAGE(STATUS "operating system is ${CMAKE_SYSTEM}")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -231,9 +255,11 @@ else ()
    target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive)
    target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module)
    target_link_libraries(_c_expression PRIVATE mindspore_gvar)
    target_link_libraries(_c_expression PRIVATE mindspore::pslite mindspore::protobuf ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a)
    if (${ENABLE_IBVERBS} STREQUAL "ON")
        target_link_libraries(_c_expression PRIVATE ibverbs rdmacm)
    if (NOT ENABLE_GE)
        target_link_libraries(_c_expression PRIVATE mindspore::pslite mindspore::protobuf ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a)
        if (${ENABLE_IBVERBS} STREQUAL "ON")
            target_link_libraries(_c_expression PRIVATE ibverbs rdmacm)
        endif()
    endif()
endif ()
@@ -260,8 +286,8 @@ if (ENABLE_CPU)
endif ()
if (ENABLE_MINDDATA)
    add_subdirectory(mindrecord)
    add_subdirectory(dataset)
    add_subdirectory(minddata/mindrecord)
    add_subdirectory(minddata/dataset)
endif ()
# build inference
@@ -270,7 +296,7 @@ set(LOAD_ONNX_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/utils/load_onnx/anf_model_parser.cc
)
add_library(inference SHARED
    ${CMAKE_CURRENT_SOURCE_DIR}/session/session.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/backend/session/session.cc
    ${LOAD_ONNX_SRC}
)
target_link_libraries(inference PRIVATE ${PYTHON_LIBRARIES} ${SECUREC_LIBRARY}
@@ -0,0 +1,66 @@
file(GLOB_RECURSE KERNEL_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "kernel_build_info.cc"
    "kash/*.cc"
    "common_utils.cc"
    "oplib/*.cc"
)
if (ENABLE_D)
    file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "kernel_query.cc"
        "kernel_fusion.cc"
        "akg/ascend/*.cc"
        "akg/akg_kernel_build.cc"
        "akg/akg_kernel_attrs_process.cc"
        "akg/akg_kernel_metadata.cc"
        "tbe/*.cc"
        "aicpu/*.cc"
        "rts/*.cc"
        "hccl/*.cc"
    )
    add_compile_definitions(ENABLE_D)
endif ()
if (ENABLE_CPU)
    file(GLOB_RECURSE CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "cpu/*.cc"
    )
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
        "cpu/ps/pull_kernel.cc"
        "cpu/ps/embedding_look_up_ps_kernel.cc"
        "cpu/ps/embedding_look_up_proxy_kernel.cc"
        "cpu/ps/apply_momentum_ps_kernel.cc"
        "cpu/ps/sparse_apply_adam_ps_kernel.cc"
        "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
    if (NOT ENABLE_MPI)
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/embedding_look_up_comm_grad_cpu_kernel.cc")
    endif ()
endif ()
if (ENABLE_GPU)
    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "gpu/*.cu"
        "akg/gpu/*.cc"
        "akg/akg_kernel_build.cc"
        "akg/akg_kernel_attrs_process.cc"
    )
    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_gpu_kernel.cc")
    if (ENABLE_MPI)
        include(ExternalProject)
        file(GLOB_RECURSE GPU_NCCL_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/nccl/*.cc")
        list(APPEND GPU_SRC_LIST ${GPU_NCCL_LIST})
    endif ()
    # add_library(_mindspore_kernel_cuda_obj OBJECT ${CUDA_SRC_LIST})
endif()
set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST}
    PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
add_library(_mindspore_backend_kernel_compiler_obj OBJECT ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST})
@@ -0,0 +1,312 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/aicpu/aicpu_kernel_build.h"
#include <google/protobuf/text_format.h>
#include <fstream>
#include <utility>
#include <string>
#include <vector>
#include <memory>
#include <algorithm>
#include <map>
#include "runtime/device/kernel_runtime.h"
#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include "proto/tensor.pb.h"
#include "proto/tensor_shape.pb.h"
#include "proto/attr.pb.h"
#include "proto/node_def.pb.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "common/utils.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "backend/session/kernel_graph.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
using FNodeAttrHandle = std::function<void(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto)>;
bool SetIOIputSize(const std::shared_ptr<AnfNode> &anf_node, const size_t &input_num,
                   std::vector<size_t> *input_size_list) {
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_EXCEPTION_IF_NULL(input_size_list);
  for (size_t i = 0; i < input_num; i++) {
    std::vector<size_t> shape_i = AnfAlgo::GetInputDeviceShape(anf_node, i);
    if (AnfAlgo::GetInputDeviceDataType(anf_node, i) == kObjectTypeString) {
      if (!anf_node->isa<CNode>()) {
        MS_LOG(EXCEPTION) << "anf_node is not a CNode.";
      }
      auto cnode = anf_node->cast<CNodePtr>();
      MS_EXCEPTION_IF_NULL(cnode);
      if (cnode->inputs().size() < (i + 1)) {
        MS_LOG(ERROR) << "cnode inputs size " << cnode->inputs().size() << " is smaller than " << i + 1;
        return false;
      }
      auto input_node = cnode->inputs()[i + 1];
      MS_EXCEPTION_IF_NULL(input_node);
      if (input_node->isa<ValueNode>()) {
        auto value_ptr = GetValueNode(input_node);
        auto value = GetValue<std::string>(value_ptr);
        input_size_list->push_back(value.size());
      }
    } else {
      auto type_ptr = TypeIdToType(AnfAlgo::GetInputDeviceDataType(anf_node, i));
      MS_EXCEPTION_IF_NULL(type_ptr);
      int64_t size_i = 1;
      for (size_t j = 0; j < shape_i.size(); j++) {
        size_i = LongMulWithOverflowCheck(size_i, static_cast<int>(shape_i[j]));
      }
      size_t type_byte = GetTypeByte(type_ptr);
      if (type_byte == 0) {
        return false;
      }
      size_i = LongMulWithOverflowCheck(size_i, SizeToInt(type_byte));
      input_size_list->push_back(LongToSize(size_i));
    }
  }
  return true;
}
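// Worked example for the non-string branch above: a float32 input with device
// shape {2, 3} yields size_i = 2 * 3 = 6 elements and type_byte = 4 bytes, so
// 6 * 4 = 24 bytes is pushed onto input_size_list.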
bool SetIOSize(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr) {
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
  std::vector<size_t> input_size_list;
  std::vector<size_t> output_size_list;
  size_t input_num = AnfAlgo::GetInputTensorNum(anf_node);
  size_t output_num = AnfAlgo::GetOutputTensorNum(anf_node);
  if (!SetIOIputSize(anf_node, input_num, &input_size_list)) {
    return false;
  }
  kernel_mod_ptr->SetInputSizeList(input_size_list);
  for (size_t i = 0; i < output_num; i++) {
    std::vector<size_t> shape_i = AnfAlgo::GetOutputDeviceShape(anf_node, i);
    TypePtr type_ptr = TypeIdToType(AnfAlgo::GetOutputDeviceDataType(anf_node, i));
    MS_EXCEPTION_IF_NULL(type_ptr);
    int64_t size_i = 1;
    for (size_t j = 0; j < shape_i.size(); j++) {
      size_i = LongMulWithOverflowCheck(size_i, static_cast<int>(shape_i[j]));
    }
    size_t type_byte = GetTypeByte(type_ptr);
    if (type_byte == 0) {
      return false;
    }
    size_i = LongMulWithOverflowCheck(size_i, SizeToInt(type_byte));
    output_size_list.push_back(LongToSize(size_i));
  }
  kernel_mod_ptr->SetOutputSizeList(output_size_list);
  return true;
}
void ParseAttrValue(const std::string &type, const std::string &attr_name, const mindspore::ValuePtr &value,
                    ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr) {
  MS_EXCEPTION_IF_NULL(node_attr);
  MS_EXCEPTION_IF_NULL(value);
  if (type == "int") {
    auto attr_value = GetValue<int>(value);
    (*node_attr)[attr_name].set_i(attr_value);
  } else if (type == "str") {
    auto attr_value = GetValue<std::string>(value);
    (*node_attr)[attr_name].set_s(attr_value);
  } else if (type == "bool") {
    auto attr_value = GetValue<bool>(value);
    (*node_attr)[attr_name].set_b(attr_value);
  } else if (type == "float") {
    auto attr_value = GetValue<float>(value);
    (*node_attr)[attr_name].set_f(attr_value);
  } else if (type == "listInt") {
    std::vector<int> attr_value;
    auto value_type = value->type();
    MS_EXCEPTION_IF_NULL(value_type);
    auto value_type_str = value_type->ToString();
    if (value_type_str == "Int32") {
      int data = GetValue<int>(value);
      attr_value.push_back(data);
    } else {
      attr_value = GetValue<std::vector<int>>(value);
    }
    mindspore::AttrValue input_shape_attr;
    mindspore::AttrValue_ArrayValue *input_shape_attr_list = input_shape_attr.mutable_array();
    MS_EXCEPTION_IF_NULL(input_shape_attr_list);
    for (const auto shape : attr_value) {
      input_shape_attr_list->add_i(shape);
    }
    (*node_attr)[attr_name] = input_shape_attr;
  } else {
    MS_LOG(EXCEPTION) << "type: " << type << " is not supported";
  }
}
void SetNodeAttr(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) {
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_EXCEPTION_IF_NULL(proto);
  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
  if (op_name == kInitDataSetQueue) {
    op_name = kInitData;
  }
  if (op_name == kPrint) {
    return;
  }
  auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAICPU);
  MS_EXCEPTION_IF_NULL(op_info_ptr);
  auto attrs_ptr = op_info_ptr->attrs_ptr();
  auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  MS_EXCEPTION_IF_NULL(primitive);
  ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs();
  for (const auto &attr_ptr : attrs_ptr) {
    MS_EXCEPTION_IF_NULL(attr_ptr);
    std::string attr_name = attr_ptr->name();
    auto value = primitive->GetAttr(attr_name);
    if (value != nullptr) {
      if (attr_name == kQueueName || attr_name == kSharedName) {
        attr_name = kChannelName;
      } else if (attr_name == kSeed0) {
        attr_name = kSeed;
      } else if (attr_name == kSeed1) {
        attr_name = kSeed2;
      }
      std::string type = attr_ptr->type();
      ParseAttrValue(type, attr_name, value, node_attr);
    }
  }
  MS_LOG(INFO) << "Set node attr end!";
}
void SetNodeInputs(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) {
  MS_EXCEPTION_IF_NULL(proto);
  MS_EXCEPTION_IF_NULL(anf_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(anf_node);
  if (input_num == 0) {
    MS_LOG(INFO) << "Node [" << AnfAlgo::GetCNodeName(anf_node) << "] does not have input.";
    return;
  }
  for (size_t input_index = 0; input_index < input_num; input_index++) {
    ::mindspore::Tensor *node_inputs = proto->add_inputs();
    MS_EXCEPTION_IF_NULL(node_inputs);
    TypeId input_type = AnfAlgo::GetInputDeviceDataType(anf_node, input_index);
    std::vector<size_t> input_shape;
    int32_t input_data_type;
    if (input_type == kObjectTypeString) {
      auto cnode = anf_node->cast<CNodePtr>();
      MS_EXCEPTION_IF_NULL(cnode);
      auto input_node = cnode->inputs()[input_index + 1];
      auto value_ptr = GetValueNode(input_node);
      auto value = GetValue<std::string>(value_ptr);
      input_shape.push_back(1);
      input_shape.push_back(value.size());
      input_data_type = AicpuOpUtil::MsTypeToProtoType(kTypeUnknown);
    } else {
      input_shape = AnfAlgo::GetInputDeviceShape(anf_node, input_index);
      input_data_type = AicpuOpUtil::MsTypeToProtoType(input_type);
    }
    mindspore::TensorShape *tensorShape = node_inputs->mutable_tensor_shape();
    for (auto item : input_shape) {
      mindspore::TensorShape_Dim *dim = tensorShape->add_dim();
      dim->set_size((::google::protobuf::int64)item);
    }
    node_inputs->set_tensor_type((mindspore::DataType)input_data_type);
    node_inputs->set_mem_device("HBM");
  }
}
void SetNodeOutputs(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) {
  MS_EXCEPTION_IF_NULL(proto);
  MS_EXCEPTION_IF_NULL(anf_node);
  size_t output_num = AnfAlgo::GetOutputTensorNum(anf_node);
  if (output_num == 0) {
    MS_LOG(INFO) << "Node [" << AnfAlgo::GetCNodeName(anf_node) << "] does not have output.";
    return;
  }
  for (size_t output_index = 0; output_index < output_num; output_index++) {
    ::mindspore::Tensor *node_outputs = proto->add_outputs();
    MS_EXCEPTION_IF_NULL(node_outputs);
    std::vector<size_t> output_shape = AnfAlgo::GetOutputDeviceShape(anf_node, output_index);
    mindspore::TensorShape *tensorShape = node_outputs->mutable_tensor_shape();
    MS_EXCEPTION_IF_NULL(tensorShape);
    for (auto item : output_shape) {
      mindspore::TensorShape_Dim *dim = tensorShape->add_dim();
      MS_EXCEPTION_IF_NULL(dim);
      dim->set_size((::google::protobuf::int64)item);
    }
    TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
    int32_t output_data_type = AicpuOpUtil::MsTypeToProtoType(output_type);
    node_outputs->set_tensor_type((mindspore::DataType)output_data_type);
    node_outputs->set_mem_device("HBM");
  }
}
void SetNodedefProto(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) {
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_EXCEPTION_IF_NULL(proto);
  MS_LOG(INFO) << "SetNodedefProto entry";
  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
  if (op_name == kInitDataSetQueue) {
    op_name = kInitData;
  }
  // set op name
  proto->set_op(op_name);
  // set inputs tensor
  SetNodeInputs(anf_node, proto);
  // set outputs tensor
  SetNodeOutputs(anf_node, proto);
  // set node attr
  SetNodeAttr(anf_node, proto);
  MS_LOG(INFO) << "SetNodedefProto end!";
}
bool CreateNodeDefBytes(const std::shared_ptr<AnfNode> &anf_node,
                        const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_LOG(INFO) << "CreateNodeDefBytes entry";
  mindspore::NodeDef proto;
  SetNodedefProto(anf_node, &proto);
  std::string nodeDefStr;
  if (!proto.SerializeToString(&nodeDefStr)) {
    MS_LOG(ERROR) << "Serialize nodeDef to string failed.";
    return false;
  }
  kernel_mod_ptr->SetNodeDef(nodeDefStr);
  MS_LOG(INFO) << "CreateNodeDefBytes end!";
  return true;
}
KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) {
  MS_EXCEPTION_IF_NULL(anf_node);
  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
  if (op_name == kInitDataSetQueue) {
    op_name = kInitData;
  }
  auto kernel_mod_ptr = std::make_shared<AicpuOpKernelMod>();
  MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
  kernel_mod_ptr->SetAnfNode(anf_node);
  kernel_mod_ptr->SetNodeName(op_name);
  if (!CreateNodeDefBytes(anf_node, kernel_mod_ptr)) {
    MS_LOG(EXCEPTION) << "Create nodeDefBytes failed!";
  }
  if (!SetIOSize(anf_node, kernel_mod_ptr)) {
    MS_LOG(EXCEPTION) << "Set input output size list failed.";
  }
  return kernel_mod_ptr;
}
} // namespace kernel
} // namespace mindspore
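A hypothetical call-site sketch for the factory above, assuming anf_node is a node already assigned to the AICPU backend (both failure paths throw via MS_LOG(EXCEPTION), so a non-null return can be assumed):

// Sketch only: anf_node comes from the kernel-selection pass, not this diff.
auto kernel_mod = mindspore::kernel::AicpuOpBuild(anf_node);
// kernel_mod now carries the serialized NodeDef plus the input/output size lists.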
@@ -0,0 +1,27 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_BUILD_H_
#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_BUILD_H_
#include <memory>
#include "backend/kernel_compiler/kernel.h"
namespace mindspore {
namespace kernel {
KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_BUILD_H_
@@ -0,0 +1,73 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/aicpu/aicpu_kernel_metadata.h"
#include <memory>
#include <string>
#include "backend/kernel_compiler/oplib/oplib.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace kernel {
void AicpuMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list) {
  MS_LOG(INFO) << "AicpuMetadataInfo.";
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(kernel_info_list);
  std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
  if (op_name == kInitDataSetQueue) {
    op_name = kInitData;
  }
  auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAICPU);
  if (op_info_ptr == nullptr) {
    MS_LOG(DEBUG) << "Aicpu does not have op [" << op_name << "]";
    return;
  }
  // For compatibility with the current framework
  if (op_name == kPrint || op_name == kGetNext || op_name == kPack) {
    std::vector<std::string> inputs_format{};
    std::vector<TypeId> inputs_type{};
    if (op_name == kPrint || op_name == kPack) {
      for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) {
        inputs_format.emplace_back(kOpFormat_DEFAULT);
        inputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index));
      }
    }
    std::vector<std::string> outputs_format;
    std::vector<TypeId> outputs_type;
    for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) {
      outputs_format.emplace_back(kOpFormat_DEFAULT);
      outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index));
    }
    auto builder = KernelBuildInfo::KernelBuildInfoBuilder();
    builder.SetInputsFormat(inputs_format);
    builder.SetInputsDeviceType(inputs_type);
    builder.SetOutputsFormat(outputs_format);
    builder.SetOutputsDeviceType(outputs_type);
    builder.SetProcessor(AICPU);
    builder.SetKernelType(AICPU_KERNEL);
    builder.SetFusionType(OPAQUE);
    kernel_info_list->push_back(builder.Build());
    return;
  }
  if (!ParseMetadata(kernel_node, op_info_ptr, AICPU, kernel_info_list)) {
    MS_LOG(WARNING) << "Aicpu parse metadata of op [" << op_name << "] failed";
    return;
  }
}
} // namespace kernel
} // namespace mindspore
@@ -0,0 +1,30 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_META_DATA_H_
#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_META_DATA_H_
#include <string>
#include <vector>
#include <memory>
#include "backend/kernel_compiler/kernel_build_info.h"
namespace mindspore {
namespace kernel {
void AicpuMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_META_DATA_H_
@@ -0,0 +1,156 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
#include <memory>
#include <vector>
#include <string>
#include <algorithm>
#include "runtime/mem.h"
#include "runtime/rt.h"
#include "backend/kernel_compiler/aicpu/aicpu_kernel_build.h"
#include "utils/convert_utils.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "utils/context/ms_context.h"
using AicpuTaskInfoPtr = std::shared_ptr<ge::model_runner::AicpuTaskInfo>;
namespace mindspore {
namespace kernel {
constexpr auto AICPU_OPS_SO_NAME = "libaicpu_kernels.so";
AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {}
AicpuOpKernelMod::~AicpuOpKernelMod() {
  args_.clear();
  inputList_.clear();
  outputList_.clear();
  anf_node_ = nullptr;
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
}
void AicpuOpKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
const std::vector<size_t> &AicpuOpKernelMod::GetInputSizeList() const { return input_size_list_; }
void AicpuOpKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }
const std::vector<size_t> &AicpuOpKernelMod::GetOutputSizeList() const { return output_size_list_; }
void AicpuOpKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
const std::vector<size_t> &AicpuOpKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &inputList) { inputList_ = inputList; }
void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { outputList_ = outputList; }
void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); }
void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; }
void AicpuOpKernelMod::SetAnfNode(const mindspore::AnfNodePtr &anf_node) {
  MS_EXCEPTION_IF_NULL(anf_node);
  anf_node_ = anf_node;
}
void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs,
                                           const std::vector<AddressPtr> &outputs) {
  MS_LOG(INFO) << "CreateCpuKernelInfoOffline start";
  node_so_ = AICPU_OPS_SO_NAME;
  // InputOutputAddr
  vector<void *> io_addrs;
  (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(io_addrs),
                       [](const AddressPtr &input) -> void * { return input->addr; });
  (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(io_addrs),
                       [](const AddressPtr &output) -> void * { return output->addr; });
  auto io_addrs_num = io_addrs.size();
  // calculate paramLen: AicpuParamHead.len + ioAddrsSize + notifyId.len + customizedAttr.len
  auto param_len = sizeof(AicpuParamHead);
  // get input and output addrs size, no need to check overflow
  auto io_addrs_size = io_addrs_num * sizeof(uint64_t);
  // refresh paramLen, no need to check overflow
  param_len += io_addrs_size;
  auto node_def_len = node_def_str_.length();
  param_len += node_def_len;
  // Create taskArgs: AicpuParamHead + ioAddrs + notifyId + customizedAttr
  AicpuParamHead paramHead = {static_cast<uint32_t>(param_len), static_cast<uint32_t>(io_addrs_num)};
  args_.clear();
  (void)args_.append(reinterpret_cast<const char *>(&paramHead), sizeof(AicpuParamHead));
  // TaskArgs append ioAddrs
  if (io_addrs_size != 0) {
    (void)args_.append(reinterpret_cast<const char *>(io_addrs.data()), io_addrs_size);
  }
  // When it's an aicpu customized op, taskArgs should append the customized attr
  if (node_def_len != 0) {
    (void)args_.append(reinterpret_cast<const char *>(node_def_str_.data()), node_def_len);
  }
  MS_LOG(INFO) << "CreateCpuKernelInfoOffline end";
}
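// Layout of args_ as assembled above, e.g. for two inputs, one output and an
// n-byte serialized NodeDef:
//   [ AicpuParamHead { length = param_len, ioAddrNum = 3 } ]
//   [ 3 x 8-byte I/O addresses                             ]
//   [ n bytes of NodeDef                                   ]
// so param_len == sizeof(AicpuParamHead) + 3 * 8 + n.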
bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                              const std::vector<AddressPtr> &outputs, void *stream_ptr) {
  if (stream_ptr == nullptr) {
    MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
    return false;
  }
  CreateCpuKernelInfo(inputs, outputs);
  if (node_name_ == kTopK) {
    node_name_ = kTopKV2;
  }
  MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_
               << ", args_size:" << args_.length();
  if (rtCpuKernelLaunch(reinterpret_cast<const void *>(node_so_.c_str()),
                        reinterpret_cast<const void *>(node_name_.c_str()), 1,
                        reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), nullptr,
                        stream_ptr) != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Aicpu op launch failed!";
    return false;
  }
  return true;
}
std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &,
                                                   const std::vector<AddressPtr> &outputs, uint32_t stream_id) {
  MS_LOG(INFO) << "AicpuOpKernelMod GenTask start";
  stream_id_ = stream_id;
  node_so_ = AICPU_OPS_SO_NAME;
  std::vector<void *> input_data_addrs;
  (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs),
                       [](const AddressPtr &input) -> void * { return input->addr; });
  std::vector<void *> output_data_addrs;
  (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs),
                       [](const AddressPtr &output) -> void * { return output->addr; });
  if (node_name_ == kTopK) {
    node_name_ = kTopKV2;
  }
  AicpuTaskInfoPtr task_info_ptr = make_shared<ge::model_runner::AicpuTaskInfo>(
    kernel_name_, stream_id, node_so_, node_name_, node_def_str_, input_data_addrs, output_data_addrs, NeedDump());
  MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";
  return {task_info_ptr};
}
} // namespace kernel
} // namespace mindspore
@@ -0,0 +1,75 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_MOD_H_
#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_MOD_H_
#include <vector>
#include <memory>
#include <string>
#include "backend/kernel_compiler/ascend_kernel_mod.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
namespace mindspore {
namespace kernel {
class AicpuOpKernelMod : public AscendKernelMod {
 public:
  AicpuOpKernelMod();
  ~AicpuOpKernelMod() override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
  std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                                   const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
  void SetInputList(const std::vector<int64_t> &inputList);
  void SetOutputList(const std::vector<int64_t> &outputList);
  void SetAnfNode(const AnfNodePtr &anf_node);
  void SetNodeDef(const std::string &nodeDef);
  void SetNodeName(const std::string &node_name);
  /**
   * @brief Build AICPU Engine kernel structure, and allocate device memory for offline task generation
   * @return SUCCESS
   * @return FAIL
   */
  void CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  void SetInputSizeList(const std::vector<size_t> &size_list);
  void SetOutputSizeList(const std::vector<size_t> &size_list);
  void SetWorkspaceSizeList(const std::vector<size_t> &size_list);
  const std::vector<size_t> &GetInputSizeList() const override;
  const std::vector<size_t> &GetOutputSizeList() const override;
  const std::vector<size_t> &GetWorkspaceSizeList() const override;
 private:
  std::string args_;
  std::string node_def_str_;
  std::string node_name_;
  std::string node_so_;
  std::vector<int64_t> inputList_;
  std::vector<int64_t> outputList_;
  AnfNodePtr anf_node_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
using AicpuOpKernelModPtr = std::shared_ptr<AicpuOpKernelMod>;
using AicputOpKernelModPtrList = std::vector<AicpuOpKernelModPtr>;
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_KERNEL_MOD_H_
@@ -0,0 +1,56 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include <vector>
#include <string>
#include "proto/types.pb.h"
#include "runtime/mem.h"
#include "runtime/rt.h"
#include "utils/convert_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace kernel {
static std::map<int32_t, int32_t> MS_PROTO_DATA_TYPE_MAP = {
  {mindspore::TypeId::kTypeUnknown, mindspore::DataType::MS_UNKNOWN},
  {mindspore::TypeId::kNumberTypeBool, mindspore::DataType::MS_BOOL},
  {mindspore::TypeId::kNumberTypeInt, mindspore::DataType::MS_INT32},
  {mindspore::TypeId::kNumberTypeInt8, mindspore::DataType::MS_INT8},
  {mindspore::TypeId::kNumberTypeInt16, mindspore::DataType::MS_INT16},
  {mindspore::TypeId::kNumberTypeInt32, mindspore::DataType::MS_INT32},
  {mindspore::TypeId::kNumberTypeInt64, mindspore::DataType::MS_INT64},
  {mindspore::TypeId::kNumberTypeUInt, mindspore::DataType::MS_UINT32},
  {mindspore::TypeId::kNumberTypeUInt8, mindspore::DataType::MS_UINT8},
  {mindspore::TypeId::kNumberTypeUInt16, mindspore::DataType::MS_UINT16},
  {mindspore::TypeId::kNumberTypeUInt32, mindspore::DataType::MS_UINT32},
  {mindspore::TypeId::kNumberTypeUInt64, mindspore::DataType::MS_UINT64},
  {mindspore::TypeId::kNumberTypeFloat16, mindspore::DataType::MS_FLOAT16},
  {mindspore::TypeId::kNumberTypeFloat, mindspore::DataType::MS_FLOAT32},
  {mindspore::TypeId::kNumberTypeFloat32, mindspore::DataType::MS_FLOAT32},
  {mindspore::TypeId::kNumberTypeFloat64, mindspore::DataType::MS_FLOAT64},
};
int AicpuOpUtil::MsTypeToProtoType(TypeId ms_type) {
  auto iter = MS_PROTO_DATA_TYPE_MAP.find(ms_type);
  if (iter != MS_PROTO_DATA_TYPE_MAP.end()) {
    return iter->second;
  } else {
    MS_LOG(ERROR) << "Unsupported ms_type value: " << static_cast<int>(ms_type);
    return -1;
  }
}
} // namespace kernel
} // namespace mindspore
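For example, with the mapping above (values taken from MS_PROTO_DATA_TYPE_MAP; an unmapped TypeId falls through to -1):

// kNumberTypeFloat32 maps to MS_FLOAT32; unknown type ids return -1.
int proto_type = mindspore::kernel::AicpuOpUtil::MsTypeToProtoType(mindspore::TypeId::kNumberTypeFloat32);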
@@ -0,0 +1,64 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_UTIL_H_
#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_UTIL_H_
#include <cstdint>
#include <vector>
#include <map>
#include <string>
#include "backend/kernel_compiler/kernel.h"
namespace mindspore {
namespace kernel {
constexpr auto kInitDataSetQueue = "InitDataSetQueue";
constexpr auto kInitData = "InitData";
constexpr auto kGetNext = "GetNext";
constexpr auto kPrint = "Print";
constexpr auto kPack = "Pack";
constexpr auto kOutputTypes = "output_types";
constexpr auto kOutputShapes = "output_shapes";
constexpr auto kChannelName = "channel_name";
constexpr auto kSharedName = "shared_name";
constexpr auto kShapes = "shapes";
constexpr auto kTypes = "types";
constexpr auto kQueueName = "queue_name";
constexpr auto kSeed = "seed";
constexpr auto kSeed0 = "Seed0";
constexpr auto kSeed1 = "Seed1";
constexpr auto kSeed2 = "seed2";
constexpr auto kTopK = "TopK";
constexpr auto kTopKV2 = "TopKV2";
struct AicpuParamHead {
  uint32_t length;         // Total length: includes custom message
  uint32_t ioAddrNum;      // Input and output address number
  uint32_t extInfoLength;  // extInfo struct length
  uint64_t extInfoAddr;    // extInfo address
} __attribute__((packed));
class AicpuOpUtil {
 public:
  static int MsTypeToProtoType(TypeId ms_type);
 private:
  // kernel id
  static uint64_t KernelId_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_KERNEL_AICPU_AICPU_UTIL_H_
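Because AicpuParamHead is packed, its size is the plain sum of its fields, 4 + 4 + 4 + 8 = 20 bytes, which the param_len arithmetic in aicpu_kernel_mod.cc relies on; a compile-time check would be:

static_assert(sizeof(mindspore::kernel::AicpuParamHead) == 20,
              "AicpuParamHead must remain packed: three uint32_t fields plus one uint64_t");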
| @@ -0,0 +1,180 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_attrs_process.h" | |||
| #include <algorithm> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SetAkgAttrsForFour2Five(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| // The x and output are akg op input and output param. | |||
| std::vector<std::string> input_names = {"x"}; | |||
| std::vector<std::string> output_names = {"output"}; | |||
| AnfAlgo::SetNodeAttr("input_names", MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr("output_names", MakeValue(output_names), anf_node); | |||
| TypeId dst_type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, 0); | |||
| std::string dst_type; | |||
| if (dst_type_id == kFloat32->type_id()) { | |||
| dst_type = "float32"; | |||
| } else if (dst_type_id == kFloat16->type_id()) { | |||
| dst_type = "float16"; | |||
| } | |||
| AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node); | |||
| } | |||
| void SetAkgAttrsForFive2Four(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> input_names = {"x"}; | |||
| std::vector<std::string> output_names = {"output"}; | |||
| AnfAlgo::SetNodeAttr("input_names", MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr("output_names", MakeValue(output_names), anf_node); | |||
| std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(anf_node, 0); | |||
| if (origin_shape.size() != kShape4dDims) { | |||
| MS_LOG(EXCEPTION) << "The dim of origin_shape is not equal to 4, but it's dim is " << origin_shape.size() << "."; | |||
| } | |||
| std::vector<int> shape_transform; | |||
| (void)std::transform(origin_shape.begin(), origin_shape.end(), std::back_inserter(shape_transform), | |||
| [](const size_t &dim) { return static_cast<int>(dim); }); | |||
| AnfAlgo::SetNodeAttr("shape4d", MakeValue(shape_transform), anf_node); | |||
| AnfAlgo::SetNodeAttr("output_format", MakeValue(kOpFormat_NCHW), anf_node); | |||
| TypeId dst_type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, 0); | |||
| std::string dst_type; | |||
| if (dst_type_id == kFloat32->type_id()) { | |||
| dst_type = "float32"; | |||
| } else if (dst_type_id == kFloat16->type_id()) { | |||
| dst_type = "float16"; | |||
| } | |||
| AnfAlgo::SetNodeAttr("dstType", MakeValue(dst_type), anf_node); | |||
| } | |||
| void SetAkgAttrsForCast(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| // The x and output are akg op input and output param. | |||
| std::vector<std::string> input_names = {"x", "dst_type"}; | |||
| std::vector<std::string> output_names = {"output"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(output_names), anf_node); | |||
| std::string dst_type; | |||
| TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0); | |||
| if (output_type == kFloat32->type_id()) { | |||
| dst_type = "float32"; | |||
| } else if (output_type == kFloat16->type_id()) { | |||
| dst_type = "float16"; | |||
| } else if (output_type == kInt32->type_id()) { | |||
| dst_type = "int32"; | |||
| } else { | |||
| MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString(); | |||
| } | |||
| AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node); | |||
| } | |||
| void SetAkgAttrsForBNGrad1(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> input_names{"dy", "data", "mean"}; | |||
| std::vector<std::string> output_names{"dgamma_red_hw", "dbeta_red_hw", "data_minus_mean"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForBNGrad2(const AnfNodePtr &anf_node) { | |||
| const size_t kBNGrad2InputSize = 5; | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> input_names{"dgamma_red_hw", "dbeta_red_hw", "variance", "gamma"}; | |||
| std::vector<std::string> output_names{"bn_scale", "bn_bias", "rs", "dgamma_dx", "dbeta_dx"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(output_names), anf_node); | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (cnode->inputs().size() < kBNGrad2InputSize) { | |||
| MS_LOG(EXCEPTION) << "The inputs size of BNGrad2 is less then " << kBNGrad2InputSize; | |||
| } | |||
| auto input1 = cnode->input(1); | |||
| MS_EXCEPTION_IF_NULL(input1); | |||
| auto tuple_getitem = input1->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(tuple_getitem); | |||
| if (tuple_getitem->inputs().size() < kTupleGetItemInputSize) { | |||
| MS_LOG(EXCEPTION) << "The inputs size of tuple_getitem is less then " << kTupleGetItemInputSize; | |||
| } | |||
| auto bn_grad1 = tuple_getitem->input(kRealInputNodeIndexInTupleGetItem); | |||
| std::vector<size_t> data_shape = AnfAlgo::GetInputDeviceShape(bn_grad1, 0); | |||
| AnfAlgo::SetNodeAttr(kAttrDataShape, MakeValue(opt::Convert2Int(data_shape)), anf_node); | |||
| } | |||
| void SetAkgAttrsForBNGrad3(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> input_names{"dy", "rs", "dgamma_dx", "dbeta_dx", "data_minus_mean"}; | |||
| std::vector<std::string> output_names{"dx"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForFusedBN1(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| // Set attr for fused_bn1 | |||
| std::vector<std::string> fused_bn1_input_names{"data"}; | |||
| std::vector<std::string> fused_bn1_output_names{"mean", "var_part"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(fused_bn1_input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(fused_bn1_output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForFusedBN2(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| // Set attr for fused_bn2 | |||
| std::vector<std::string> fused_bn2_input_names{"mean", "var_part", "running_mean", "running_var"}; | |||
| std::vector<std::string> fused_bn2_output_names{"variance", "running_mean", "running_variance"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(fused_bn2_input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(fused_bn2_output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForFusedBN3(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| // Set attr for fused_bn3 | |||
| std::vector<std::string> fused_bn3_input_names{"data", "mean", "variance", "gamma", "beta"}; | |||
| std::vector<std::string> fused_bn3_output_names{"y"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(fused_bn3_input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(fused_bn3_output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForConvBN1(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> conv_bn1_output_names{"data", "var_part", "mean"}; | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(conv_bn1_output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForBN2AddRelu(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> bn2_add_relu_input_names{"data", "var_part", "mean", "other_branch_data", | |||
| "gamma", "beta", "running_mean", "running_var"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(bn2_add_relu_input_names), anf_node); | |||
| std::vector<std::string> bn2_add_relu_output_names{"output", "running_mean", "running_variance", "save_inv_variance"}; | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(bn2_add_relu_output_names), anf_node); | |||
| } | |||
| void SetAkgAttrsForBN2Relu(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::vector<std::string> bn2_input_names{"data", "var_part", "mean", "gamma", "beta", "running_mean", "running_var"}; | |||
| std::vector<std::string> bn2_output_names{"y", "running_mean", "running_variance", "save_inv_variance"}; | |||
| AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(bn2_input_names), anf_node); | |||
| AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(bn2_output_names), anf_node); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_ATTRS_PROCESS_H | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_ATTRS_PROCESS_H | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "ir/anf.h" | |||
| #include "utils/utils.h" | |||
| #include "frontend/operator/ops.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SetAkgAttrsForFour2Five(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForFive2Four(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForCast(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForBNGrad1(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForBNGrad2(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForBNGrad3(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForFusedBN1(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForFusedBN2(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForFusedBN3(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForConvBN1(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForBN2AddRelu(const AnfNodePtr &anf_node); | |||
| void SetAkgAttrsForBN2Relu(const AnfNodePtr &anf_node); | |||
| const std::unordered_map<std::string, std::function<void(const AnfNodePtr &anf_node)>> kAkgKernelAttrsProcessMap = { | |||
| {kFour2FiveOpName, SetAkgAttrsForFour2Five}, | |||
| {kFive2FourOpName, SetAkgAttrsForFive2Four}, | |||
| {"Cast", SetAkgAttrsForCast}, | |||
| {kBNGrad1OpName, SetAkgAttrsForBNGrad1}, | |||
| {kBNGrad2OpName, SetAkgAttrsForBNGrad2}, | |||
| {kBNGrad3OpName, SetAkgAttrsForBNGrad3}, | |||
| {kFusedBN1OpName, SetAkgAttrsForFusedBN1}, | |||
| {kFusedBN2OpName, SetAkgAttrsForFusedBN2}, | |||
| {kFusedBN3OpName, SetAkgAttrsForFusedBN3}, | |||
| {kConvBN1OpName, SetAkgAttrsForConvBN1}, | |||
| {kBN2AddReluOpName, SetAkgAttrsForBN2AddRelu}, | |||
| {kBN2ReLUOpName, SetAkgAttrsForBN2Relu}, | |||
| }; | |||
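| // Usage sketch for the dispatch table above (illustrative helper name; the | |||
| // real call sites perform this lookup inline before generating kernel json): | |||
| // | |||
| //   void ProcessAkgAttrsIfRegistered(const AnfNodePtr &anf_node, const std::string &op_name) { | |||
| //     auto it = kAkgKernelAttrsProcessMap.find(op_name); | |||
| //     if (it != kAkgKernelAttrsProcessMap.end()) { | |||
| //       it->second(anf_node);  // e.g. SetAkgAttrsForCast when op_name == "Cast" | |||
| //     } | |||
| //   } | |||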
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_ATTRS_PROCESS_H | |||
| @@ -0,0 +1,623 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_build.h" | |||
| #include <Python.h> | |||
| #include <sys/types.h> | |||
| #include <signal.h> | |||
| #include <unistd.h> | |||
| #include <dirent.h> | |||
| #include <cctype> | |||
| #include <cstdint> | |||
| #include <memory> | |||
| #include <map> | |||
| #include <utility> | |||
| #include <algorithm> | |||
| #include <functional> | |||
| #include <sstream> | |||
| #include <iterator> | |||
| #include <numeric> | |||
| #include <unordered_set> | |||
| #include "common/utils.h" | |||
| #include "utils/convert_utils.h" | |||
| #include "utils/any.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/akg/akg_kernel_attrs_process.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr int ME_MAX_KERNEL_NAME_LENGTH = 200; | |||
| constexpr int32_t ARGS_SIZE = 1; | |||
| constexpr auto kCompileWithJsonFunc = "compilewithjson"; | |||
| // json key | |||
| constexpr auto kOpDesc = "op_desc"; | |||
| constexpr auto kInputDesc = "input_desc"; | |||
| constexpr auto kShape = "shape"; | |||
| constexpr auto kDataType = "data_type"; | |||
| constexpr auto kOutputDesc = "output_desc"; | |||
| constexpr auto kName = "name"; | |||
| constexpr auto kTensorName = "tensor_name"; | |||
| constexpr auto kValue = "value"; | |||
| constexpr auto KDynInputSizes = "dyn_input_sizes"; | |||
| constexpr auto KInputNames = "input_names"; | |||
| constexpr auto KInput = "input"; | |||
| constexpr auto KDtype = "dtype"; | |||
| namespace { | |||
| template <typename T> | |||
| std::string Vector2Str(const std::vector<T> &inputs) { | |||
| if (!inputs.empty()) { | |||
| std::ostringstream oss; | |||
| (void)std::copy(inputs.begin(), inputs.end() - 1, std::ostream_iterator<T>(oss, ", ")); | |||
| oss << inputs.back(); | |||
| return oss.str(); | |||
| } | |||
| return ""; | |||
| } | |||
| } // namespace | |||
| std::string AkgKernelBuild::PyObjectToStr(PyObject *const PyObj) { | |||
| char *pChar = nullptr; | |||
| std::string str_res; | |||
| if (PyObj == nullptr) { | |||
| MS_LOG(ERROR) << "Input parameter is nullptr."; | |||
| return str_res; | |||
| } | |||
| PyObject *strArgs = PyObject_Str(PyObj); | |||
| if (strArgs != nullptr) { | |||
| (void)PyArg_Parse(strArgs, "s", &pChar); | |||
| } | |||
| if (pChar == nullptr) { | |||
| MS_LOG(ERROR) << "pChar is nullptr."; | |||
| return str_res; | |||
| } | |||
| str_res = pChar; | |||
| return str_res; | |||
| } | |||
| std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, | |||
| const std::pair<size_t, size_t> &position) { | |||
| if (node_json.count(tag) == 0) { | |||
| MS_LOG(ERROR) << "Node [" << node_json.dump() << "] has no key [" << tag << "]."; | |||
| return ""; | |||
| } | |||
| auto const &tag_desc = node_json[tag]; | |||
| nlohmann::json first_index; | |||
| if (tag == kOutputDesc) { | |||
| first_index = tag_desc; | |||
| } else if (!tag_desc.is_array() || tag_desc.size() <= position.first) { | |||
| MS_LOG(ERROR) << "Node [" << tag_desc.dump() << "] has no enough value [" << position.first << "]."; | |||
| return ""; | |||
| } else { | |||
| first_index = tag_desc[position.first]; | |||
| } | |||
| if (!first_index.is_array() || first_index.size() <= position.second) { | |||
| MS_LOG(ERROR) << "Node [" << first_index.dump() << "] has no enough value [" << position.second << "]."; | |||
| return ""; | |||
| } | |||
| auto const &second_index = first_index[position.second]; | |||
| if (second_index.count(kTensorName) == 0) { | |||
| MS_LOG(ERROR) << "Node [" << second_index.dump() << "] has no key [" << kTensorName << "]."; | |||
| return ""; | |||
| } | |||
| return second_index[kTensorName]; | |||
| } | |||
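| // For orientation, a sketch of the node json shape these accessors index, | |||
| // reconstructed from the logic above rather than from a formal spec: | |||
| // input_desc is an array of arrays (one inner array per op input, so dynamic | |||
| // inputs can hold several tensors), while output_desc is a flat array, which | |||
| // is why the kOutputDesc branch skips the position.first lookup: | |||
| // | |||
| //   { | |||
| //     "input_desc":  [[{"tensor_name": "input_0", ...}], [{"tensor_name": "input_1", ...}]], | |||
| //     "output_desc": [{"tensor_name": "output_0_0", ...}] | |||
| //   } | |||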
| void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair<size_t, size_t> &position, | |||
| nlohmann::json *const node_json) { | |||
| MS_EXCEPTION_IF_NULL(node_json); | |||
| if (node_json->count(tag) == 0) { | |||
| MS_LOG(ERROR) << "Node [" << node_json->dump() << "] has no key [" << tag << "]."; | |||
| return; | |||
| } | |||
| nlohmann::json *tag_desc = &((*node_json)[tag]); | |||
| nlohmann::json *first_index; | |||
| if (tag == kOutputDesc) { | |||
| first_index = tag_desc; | |||
| } else if (!tag_desc->is_array() || tag_desc->size() <= position.first) { | |||
| MS_LOG(ERROR) << "Node [" << tag_desc->dump() << "] has no enough value [" << position.first << "]."; | |||
| return; | |||
| } else { | |||
| first_index = &((*tag_desc)[position.first]); | |||
| } | |||
| if (!first_index->is_array() || first_index->size() <= position.second) { | |||
| MS_LOG(ERROR) << "Node [" << first_index->dump() << "] has no enough value [" << position.second << "]."; | |||
| return; | |||
| } | |||
| nlohmann::json *second_index = &((*first_index)[position.second]); | |||
| if (second_index->count(kTensorName) == 0) { | |||
| MS_LOG(ERROR) << "Node [" << second_index->dump() << "] has no key [" << kTensorName << "]."; | |||
| return; | |||
| } | |||
| (*second_index)[kTensorName] = new_name; | |||
| return; | |||
| } | |||
| int AkgKernelBuild::op_cnt_ = 0; | |||
| std::mutex AkgKernelBuild::op_cnt_mtx_; | |||
| std::string AkgKernelBuild::GetProcessor(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::string device; | |||
| switch (AnfAlgo::GetProcessor(anf_node)) { | |||
| case Processor::AICORE: | |||
| device = kProcessorAiCore; | |||
| break; | |||
| case Processor::AICPU: | |||
| device = kProcessorAiCpu; | |||
| break; | |||
| case Processor::CUDA: | |||
| device = kProcessorCuda; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Unknown processor type."; | |||
| break; | |||
| } | |||
| return device; | |||
| } | |||
| bool GetIOSize(const nlohmann::json &node_json, std::vector<size_t> *const input_size, | |||
| std::vector<size_t> *const output_size) { | |||
| if (input_size == nullptr || output_size == nullptr) { | |||
| MS_LOG(ERROR) << "input size or output size is nullptr"; | |||
| return false; | |||
| } | |||
| input_size->clear(); | |||
| output_size->clear(); | |||
| for (size_t i = 0; i < node_json[kInputDesc].size(); i++) { | |||
| for (size_t m = 0; m < node_json[kInputDesc][i].size(); m++) { | |||
| std::string dtype = node_json[kInputDesc][i][m][kDataType]; | |||
| size_t nbyte = GetDtypeNbyte(dtype); | |||
| size_t size_i = std::accumulate(node_json[kInputDesc][i][m][kShape].begin(), | |||
| node_json[kInputDesc][i][m][kShape].end(), nbyte, std::multiplies<size_t>()); | |||
| input_size->push_back(size_i); | |||
| } | |||
| } | |||
| for (size_t i = 0; i < node_json[kOutputDesc].size(); i++) { | |||
| std::string dtype = node_json[kOutputDesc][i][kDataType]; | |||
| size_t nbyte = GetDtypeNbyte(dtype); | |||
| size_t size_i = std::accumulate(node_json[kOutputDesc][i][kShape].begin(), node_json[kOutputDesc][i][kShape].end(), | |||
| nbyte, std::multiplies<size_t>()); | |||
| output_size->push_back(size_i); | |||
| } | |||
| return true; | |||
| } | |||
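| // Worked example (illustrative numbers): a "float32" input of shape [2, 3] | |||
| // gives nbyte == 4, and the accumulate above computes 4 * 2 * 3 = 24 bytes, | |||
| // since nbyte seeds the product over the shape dimensions. | |||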
| int AkgKernelBuild::GetOpCntInc() { | |||
| std::lock_guard<std::mutex> lock(op_cnt_mtx_);  // RAII: unlocked on every return path. | |||
| return op_cnt_++; | |||
| } | |||
| bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const inputs_json) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(inputs_json); | |||
| // for dynamic input number, dyn_input_sizes has the info of dynamic input num for each input. | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| auto op_info = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); | |||
| if (op_info == nullptr) { | |||
| MS_LOG(ERROR) << "Apply kernel [" << op_name << "] op_info is nullptr"; | |||
| return false; | |||
| } | |||
| std::vector<std::shared_ptr<OpIOInfo>> inputs_ptr = op_info->inputs_ptr(); | |||
| if (inputs_ptr.empty()) { | |||
| MS_LOG(INFO) << "Apply kernel [" << op_name << "] regist info has no input info"; | |||
| return true; | |||
| } | |||
| auto op_info_input_num = inputs_ptr.size(); | |||
| // for dynamic input number, dyn_input_sizes has the info of dynamic input num for each input. | |||
| std::vector<int> dyn_input_sizes; | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) { | |||
| dyn_input_sizes = GetValue<const std::vector<int>>(primitive->GetAttr(kAttrDynInputSizes)); | |||
| } | |||
| size_t real_input_index = 0; | |||
| std::vector<nlohmann::json> input_list; | |||
| for (size_t i = 0; i < op_info_input_num; i++) { | |||
| size_t input_tensor_num; | |||
| std::shared_ptr<OpIOInfo> input_ptr = inputs_ptr[i]; | |||
| std::string op_input_name; | |||
| if (input_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Apply kernel [" << op_name << "] regist input[" << i << "] is nullptr"; | |||
| return false; | |||
| } | |||
| op_input_name = input_ptr->name(); | |||
| if (dyn_input_sizes.empty()) { | |||
| input_tensor_num = 1; | |||
| } else { | |||
| input_tensor_num = IntToSize(dyn_input_sizes[i]); | |||
| } | |||
| input_list.clear(); | |||
| for (size_t input_i = 0; input_i < input_tensor_num; input_i++) { | |||
| // dtype : float16 | |||
| auto type_id = AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index); | |||
| std::string dtype = TypeId2String(type_id); | |||
| if (dtype.empty()) { | |||
| MS_LOG(ERROR) << "Op [" << op_name << "] input [" << input_i << "] data type is null. "; | |||
| return false; | |||
| } | |||
| nlohmann::json input_desc_json; | |||
| input_desc_json[kDataType] = dtype; | |||
| input_desc_json[kName] = op_input_name; | |||
| input_desc_json[kTensorName] = "input_" + std::to_string(GetInputTensorIdxInc(anf_node, real_input_index)); | |||
| auto input_shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); | |||
| if (anf_node->func_graph() != nullptr && anf_node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && | |||
| GetInputTensorValue(anf_node, real_input_index, &input_desc_json)) { | |||
| MS_LOG(WARNING) << "we take input[" << real_input_index << "] of [" << anf_node->DebugString(2) | |||
| << "] as const tensor, shape: [" << Vector2Str(input_shape) | |||
| << "], value: " << input_desc_json[kValue]; | |||
| input_shape.clear(); | |||
| } | |||
| if (input_shape.empty()) { | |||
| input_shape.push_back(1); | |||
| } | |||
| input_desc_json[kShape] = input_shape; | |||
| input_list.emplace_back(input_desc_json); | |||
| real_input_index++; | |||
| } | |||
| inputs_json->emplace_back(input_list); | |||
| } | |||
| return true; | |||
| } | |||
| bool AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const outputs_json) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(outputs_json); | |||
| size_t output_tensor_num = AnfAlgo::GetOutputTensorNum(anf_node); | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); | |||
| auto outputs = op_info_ptr->outputs_ptr(); | |||
| for (size_t i = 0; i < output_tensor_num; i++) { | |||
| nlohmann::json output_json; | |||
| auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, i); | |||
| std::string dtype = TypeId2String(type_id); | |||
| if (dtype.empty()) { | |||
| MS_LOG(ERROR) << "Op [" << op_name << "] output [" << i << "] data type is null. "; | |||
| return false; | |||
| } | |||
| std::string output_name = outputs[i]->name(); | |||
| output_json[kDataType] = dtype; | |||
| output_json[kName] = output_name; | |||
| output_json[kTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); | |||
| output_json[kShape] = AnfAlgo::GetOutputDeviceShape(anf_node, i); | |||
| outputs_json->push_back(output_json); | |||
| } | |||
| return true; | |||
| } | |||
| void GetJson(const AnfNodePtr &anf_node, const std::vector<int> &dyn_input_sizes, | |||
| const std::shared_ptr<OpAttr> &op_attr, nlohmann::json *const attr_json, const ValuePtr &attr_value) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(op_attr); | |||
| MS_EXCEPTION_IF_NULL(attr_json); | |||
| std::string type = op_attr->type(); | |||
| if (type == "int") { | |||
| (*attr_json)[kValue] = GetValue<int>(attr_value); | |||
| } else if (type == "str") { | |||
| (*attr_json)[kValue] = GetValue<std::string>(attr_value); | |||
| } else if (type == "bool") { | |||
| (*attr_json)[kValue] = GetValue<bool>(attr_value); | |||
| } else if (type == "float") { | |||
| (*attr_json)[kValue] = GetValue<float>(attr_value); | |||
| } else if (type == "listInt") { | |||
| (*attr_json)[kValue] = GetValue<std::vector<int>>(attr_value); | |||
| } else if (type == "listStr") { | |||
| std::vector<std::string> data_format; | |||
| if (op_attr->name() == kArgDataformat) { | |||
| size_t tensor_args_num = !dyn_input_sizes.empty() ? dyn_input_sizes.size() : AnfAlgo::GetInputTensorNum(anf_node); | |||
| for (size_t format_i = 0; format_i < tensor_args_num; format_i++) { | |||
| auto input_format = AnfAlgo::GetInputFormat(anf_node, format_i); | |||
| data_format.push_back(input_format); | |||
| } | |||
| } else { | |||
| data_format = GetValue<std::vector<std::string>>(attr_value); | |||
| } | |||
| (*attr_json)[kValue] = data_format; | |||
| } else { | |||
| MS_LOG(WARNING) << "attr type:" << type; | |||
| } | |||
| } | |||
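| // For example (illustrative values): an "axis" attribute registered with type | |||
| // "listInt" ends up in the attrs json as {"name": "axis", "value": [0, 1]}; | |||
| // GetJson fills in kValue here and the caller adds kName afterwards. | |||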
| bool AkgKernelBuild::CreateAttrDescJson(const AnfNodePtr &anf_node, const std::string &op_name, | |||
| const std::shared_ptr<OpInfo> &op_info, nlohmann::json *const attrs_json) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(attrs_json); | |||
| MS_EXCEPTION_IF_NULL(op_info); | |||
| std::vector<std::shared_ptr<OpAttr>> attrs = op_info->attrs_ptr(); | |||
| if (attrs.empty()) { | |||
| MS_LOG(INFO) << "Apply kernel [" << op_name << "] op info attrs is empty"; | |||
| return true; | |||
| } | |||
| std::vector<std::shared_ptr<OpIOInfo>> inputs = op_info->inputs_ptr(); | |||
| std::vector<int> dyn_input_sizes; | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) { | |||
| dyn_input_sizes = GetValue<const std::vector<int>>(primitive->GetAttr(kAttrDynInputSizes)); | |||
| } | |||
| if (inputs.empty()) { | |||
| MS_LOG(ERROR) << "Apply kernel [" << op_name << "] op info inputs is empty"; | |||
| return false; | |||
| } | |||
| // Create an input name list to match "x_shape" in attrs with "x" in the primitive. | |||
| std::map<size_t, std::string> op_info_shape_name; | |||
| for (size_t op_info_input_i = 0; op_info_input_i < inputs.size(); op_info_input_i++) { | |||
| std::string input_name = inputs[op_info_input_i]->name(); | |||
| std::string x_shape_name = input_name + "_shape"; | |||
| (void)op_info_shape_name.insert(make_pair(op_info_input_i, x_shape_name)); | |||
| } | |||
| for (const auto &op_attr : attrs) { | |||
| nlohmann::json attr_json; | |||
| ValuePtr attr_value = primitive->GetAttr(op_attr->name()); | |||
| if (attr_value == nullptr && op_attr->name() != kArgDataformat) { | |||
| if (op_attr->param_type() == "required") { | |||
| // match "x_shape" in att with "x" in primitive. | |||
| std::string attr_name = op_attr->name(); | |||
| auto find_item = std::find_if( | |||
| op_info_shape_name.begin(), op_info_shape_name.end(), | |||
| [attr_name](const std::map<size_t, std::string>::value_type item) { return item.second == attr_name; }); | |||
| if (find_item != op_info_shape_name.end()) { | |||
| if (!dyn_input_sizes.empty()) { | |||
| if (find_item->first >= dyn_input_sizes.size() - 1) { | |||
| MS_LOG(EXCEPTION) << "dyn_input_sizes list index:" << find_item->first | |||
| << " is out of range:" << dyn_input_sizes.size() - 1 << "."; | |||
| return false; | |||
| } | |||
| size_t tensor_idx = IntToSize(std::accumulate(&dyn_input_sizes[0], &dyn_input_sizes[find_item->first], 0)); | |||
| for (int input_i = 0; input_i < dyn_input_sizes[find_item->first]; input_i++) { | |||
| attr_json[kValue] = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, tensor_idx); | |||
| attr_json[kName] = op_attr->name(); | |||
| attrs_json->push_back(attr_json); | |||
| tensor_idx++; | |||
| } | |||
| } else { | |||
| attr_json[kValue] = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, find_item->first); | |||
| attr_json[kName] = op_attr->name(); | |||
| attrs_json->push_back(attr_json); | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "op [" << op_name << "] should have attr :" << op_attr->name(); | |||
| return false; | |||
| } | |||
| } | |||
| continue; | |||
| } | |||
| GetJson(anf_node, dyn_input_sizes, op_attr, &attr_json, attr_value); | |||
| attr_json[kName] = op_attr->name(); | |||
| attrs_json->push_back(attr_json); | |||
| } | |||
| return true; | |||
| } | |||
| bool AkgKernelBuild::GenerateSingleKernelJson(const AnfNodePtr &anf_node, const std::string &op_name, | |||
| nlohmann::json *const node_json) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(node_json); | |||
| int op_cnt = GetOpCntInc(); | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); | |||
| MS_EXCEPTION_IF_NULL(op_info_ptr); | |||
| // get basic params from currentNodeOpDesc | |||
| (*node_json)[kName] = op_name; | |||
| (*node_json)["impl_path"] = op_info_ptr->impl_path(); | |||
| (*node_json)["process"] = AkgKernelBuild::GetProcessor(anf_node); | |||
| (*node_json)["composite"] = false; | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| ValuePtr input_names_v = primitive->GetAttr(KInputNames); | |||
| if (input_names_v == nullptr) { | |||
| MS_LOG(ERROR) << "ApplyKernel has no input_names, op[" << op_name << "]."; | |||
| return false; | |||
| } | |||
| std::vector<std::string> prim_input_names = GetValue<const std::vector<std::string>>(input_names_v); | |||
| std::string inputs_name; | |||
| for (const auto &prim_input_name : prim_input_names) { | |||
| (void)inputs_name.append("_input_").append(prim_input_name).append("_"); | |||
| } | |||
| // input desc | |||
| nlohmann::json inputs_json; | |||
| if (!CreateInputDescJson(anf_node, &inputs_json)) { | |||
| MS_LOG(ERROR) << "Create input desc json failed, op[" << op_name << "]."; | |||
| return false; | |||
| } | |||
| (*node_json)[kInputDesc] = inputs_json; | |||
| MS_LOG(INFO) << "Akg create input desc json success."; | |||
| std::string inputs_shape = "inputs_shape_"; | |||
| for (auto &i : inputs_json) { | |||
| for (auto &m : i) { | |||
| std::string data_type = m[kDataType]; | |||
| (void)inputs_shape.append("_").append(data_type).append("_"); | |||
| for (auto &j : m[kShape]) { | |||
| size_t n = j; | |||
| (void)inputs_shape.append(std::to_string(n)).append("_"); | |||
| } | |||
| } | |||
| } | |||
| // output desc | |||
| nlohmann::json outputs_json; | |||
| if (!CreateOutputDescJson(anf_node, &outputs_json)) { | |||
| MS_LOG(ERROR) << "Create output desc json failed, op[" << op_name << "]."; | |||
| return false; | |||
| } | |||
| (*node_json)[kOutputDesc] = outputs_json; | |||
| MS_LOG(INFO) << "Akg create output desc json success."; | |||
| std::string outputs_shape = "outputs_shape_"; | |||
| for (auto &i : outputs_json) { | |||
| std::string data_type = i[kDataType]; | |||
| (void)outputs_shape.append("_").append(data_type).append("_"); | |||
| for (auto &j : i[kShape]) { | |||
| size_t m = j; | |||
| (void)outputs_shape.append(std::to_string(m)).append("_"); | |||
| } | |||
| } | |||
| // attribute desc | |||
| nlohmann::json attrs_json; | |||
| if (!CreateAttrDescJson(anf_node, op_name, op_info_ptr, &attrs_json)) { | |||
| MS_LOG(ERROR) << "Create attr desc json failed, op[" << op_name << "]."; | |||
| return false; | |||
| } | |||
| (*node_json)["attr"] = attrs_json; | |||
| std::string json_str = node_json->dump(); | |||
| size_t hash_id = std::hash<std::string>()(json_str); | |||
| json_name_ = op_name + "_"; | |||
| (void)json_name_.append(std::to_string(hash_id)); | |||
| MS_LOG(INFO) << "full scope name is : " << anf_node->fullname_with_scope() << ", json info name is : " << json_name_; | |||
| json_info_ = json_str; | |||
| (*node_json)["id"] = op_cnt; | |||
| (*node_json)["op"] = json_name_; | |||
| MS_LOG(INFO) << "Akg create node desc json success."; | |||
| return true; | |||
| } | |||
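| // The json_name_ above is content-addressed: hashing the dumped json makes | |||
| // identical kernels collapse to one name, which is what the kernel cache in | |||
| // OpBuild keys on. A minimal standalone sketch of the same scheme, with an | |||
| // illustrative function name that is not part of the original source: | |||
| // | |||
| //   std::string MakeKernelJsonName(const std::string &op_name, const std::string &json_str) { | |||
| //     size_t hash_id = std::hash<std::string>()(json_str); | |||
| //     return op_name + "_" + std::to_string(hash_id); | |||
| //   } | |||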
| KernelPackPtr AkgKernelBuild::OpBuild(const std::string &node_json, const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto processor = AkgKernelBuild::GetProcessor(anf_node); | |||
| auto cached_kernel_pack = SearchCache(json_name_, processor); | |||
| if (cached_kernel_pack != nullptr) { | |||
| MS_LOG(INFO) << "Use cached kernel, json_name_[" << json_name_ << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| return cached_kernel_pack; | |||
| } | |||
| PyObject *pModule = nullptr; | |||
| PyObject *pFunc = nullptr; | |||
| PyObject *pArg = nullptr; | |||
| PyObject *pRes = nullptr; | |||
| pModule = PyImport_ImportModule(kAkgModule); | |||
| if (pModule == nullptr) { | |||
| MS_LOG(ERROR) << "Failed to import [" << kAkgModule << "]."; | |||
| return nullptr; | |||
| } | |||
| pFunc = PyObject_GetAttrString(pModule, kCompileWithJsonFunc); | |||
| pArg = PyTuple_New(ARGS_SIZE); | |||
| (void)PyTuple_SetItem(pArg, 0, Py_BuildValue("s", node_json.c_str())); | |||
| (void)alarm(AUTODIFF_COMPILE_OVERTIME); | |||
| pRes = PyEval_CallObject(pFunc, pArg); | |||
| (void)alarm(0); | |||
| if (pRes == nullptr) { | |||
| MS_LOG(ERROR) << "No ret got, failed to call function [" << kCompileWithJsonFunc << "], args:\n(" | |||
| << AkgKernelBuild::PyObjectToStr(pArg) << ")."; | |||
| return nullptr; | |||
| } | |||
| if (PyObject_IsTrue(pRes) != 1) { | |||
| MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileWithJsonFunc << "], args:\n(" | |||
| << AkgKernelBuild::PyObjectToStr(pArg) << ")."; | |||
| return nullptr; | |||
| } | |||
| auto new_kernel_pack = InsertCache(json_name_, processor); | |||
| kernel::SaveJsonInfo(json_name_, json_info_); | |||
| if (new_kernel_pack == nullptr) { | |||
| MS_LOG(ERROR) << "Insert to cache failed, json_name_[" << json_name_ << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| return nullptr; | |||
| } | |||
| return new_kernel_pack; | |||
| } | |||
| KernelPackPtr AkgKernelBuild::BuildByJson(const AnfNodePtr &anf_node, std::vector<size_t> *const input_size, | |||
| std::vector<size_t> *const output_size) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| auto it = kAkgKernelAttrsProcessMap.find(op_name); | |||
| if (it != kAkgKernelAttrsProcessMap.end()) { | |||
| it->second(anf_node); | |||
| } | |||
| MS_LOG(INFO) << "Akg start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; | |||
| nlohmann::json node_json; | |||
| if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { | |||
| MS_LOG(ERROR) << "Op[" << op_name << "] create single kernel json failed."; | |||
| } | |||
| std::string json_str = node_json.dump(); | |||
| auto kernel_pack = OpBuild(json_str, anf_node); | |||
| if (kernel_pack == nullptr) { | |||
| MS_LOG(ERROR) << "Akg build failed op[" << op_name << "], json:" << json_str; | |||
| return nullptr; | |||
| } | |||
| if (!GetIOSize(node_json, input_size, output_size)) { | |||
| MS_LOG(ERROR) << "Cal mem size failed."; | |||
| return nullptr; | |||
| } | |||
| MS_LOG(INFO) << "Akg compile success, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) | |||
| << "]"; | |||
| return kernel_pack; | |||
| } | |||
| size_t AkgKernelBuild::GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (input_idx + 1 >= cnode->inputs().size()) { | |||
| MS_EXCEPTION(ArgumentError) << "input_idx [" << input_idx << "] is out of index of inputs of [" | |||
| << cnode->inputs().size() - 1 << "][" << cnode->DebugString() << "]"; | |||
| } | |||
| auto input_node = cnode->input(input_idx + 1); | |||
| if (input_tensor_idx_.find(input_node) == input_tensor_idx_.end()) { | |||
| size_t index = input_tensor_idx_.size(); | |||
| input_tensor_idx_[input_node] = index; | |||
| } | |||
| return input_tensor_idx_[input_node]; | |||
| } | |||
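| // Note (added comment): the map above deduplicates inputs. If the same | |||
| // AnfNode feeds several input slots, every slot receives the same | |||
| // "input_<idx>" tensor name, so the generated kernel binds one buffer | |||
| // instead of several copies. | |||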
| size_t AkgKernelBuild::GetOutputTensorIdxInc() { | |||
| return output_tensor_idx_++; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,76 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_AKGKERNELBUILD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_AKGKERNELBUILD_H_ | |||
| #include <unordered_map> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <map> | |||
| #include <utility> | |||
| #include <mutex> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ir/dtype.h" | |||
| #include <nlohmann/json.hpp> | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AkgKernelBuild { | |||
| public: | |||
| AkgKernelBuild() { | |||
| input_tensor_idx_ = {}; | |||
| output_tensor_idx_ = 0; | |||
| } | |||
| ~AkgKernelBuild() = default; | |||
| KernelPackPtr BuildByJson(const AnfNodePtr &anf_node, std::vector<size_t> *const input_size, | |||
| std::vector<size_t> *const output_size); | |||
| static std::string GetProcessor(const AnfNodePtr &anf_node); | |||
| static std::string PyObjectToStr(PyObject *const PyObj); | |||
| protected: | |||
| bool CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const inputs_json); | |||
| bool CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const outputs_json); | |||
| bool CreateAttrDescJson(const AnfNodePtr &anf_node, const std::string &op_name, | |||
| const std::shared_ptr<OpInfo> &op_info, nlohmann::json *const attrs_json); | |||
| KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node); | |||
| int GetOpCntInc(); | |||
| size_t GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx); | |||
| size_t GetOutputTensorIdxInc(); | |||
| bool GenerateSingleKernelJson(const AnfNodePtr &anf_node, const std::string &op_name, | |||
| nlohmann::json *const node_json); | |||
| static int op_cnt_; | |||
| // lock for variable fusionOpCnt in singleton mode | |||
| static std::mutex op_cnt_mtx_; | |||
| std::string json_name_; | |||
| std::string json_info_; | |||
| std::unordered_map<AnfNodePtr, size_t> input_tensor_idx_; | |||
| size_t output_tensor_idx_; | |||
| }; | |||
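| // A hedged caller-side sketch of the builder (the concrete call site in the | |||
| // framework may differ; anf_node stands for any real kernel node): | |||
| // | |||
| //   AkgKernelBuild builder; | |||
| //   std::vector<size_t> input_sizes; | |||
| //   std::vector<size_t> output_sizes; | |||
| //   auto kernel_pack = builder.BuildByJson(anf_node, &input_sizes, &output_sizes); | |||
| //   if (kernel_pack == nullptr) { | |||
| //     // Compilation failed; the caller is expected to handle this. | |||
| //   } | |||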
| bool GetIOSize(const nlohmann::json &node_json, std::vector<size_t> *const input_size, | |||
| std::vector<size_t> *const output_size); | |||
| void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair<size_t, size_t> &position, | |||
| nlohmann::json *const node_json); | |||
| std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, | |||
| const std::pair<size_t, size_t> &position); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_AKGKERNELBUILD_H_ | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_metadata.h" | |||
| #include <memory> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AkgMetadataInfo(const CNodePtr &kernel_node, | |||
| std::vector<std::shared_ptr<KernelBuildInfo>> *const kernel_info_list) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_info_list); | |||
| std::string op_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| for (size_t i = 0; i < support_devices.size(); i++) { | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); | |||
| if (op_info_ptr == nullptr) { | |||
| continue; | |||
| } | |||
| if (!ParseMetadata(kernel_node, op_info_ptr, Processor(i), kernel_info_list)) { | |||
| MS_LOG(WARNING) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "] failed."; | |||
| } else { | |||
| MS_LOG(DEBUG) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "]."; | |||
| break; | |||
| } | |||
| } | |||
| if (kernel_info_list->empty()) { | |||
| MS_LOG(WARNING) << "Akg dose not has metadata of op[" << op_name << "]."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel_build_info.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AkgMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ | |||
| @@ -0,0 +1,422 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.h" | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_set> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include <Python.h> | |||
| #include "ir/dtype.h" | |||
| #include "ir/func_graph.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/tbe/tbe_utils.h" | |||
| #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_mod.h" | |||
| #include "backend/kernel_compiler/akg/akg_kernel_attrs_process.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr int32_t PARALLEL_ARGS_SIZE = 3; | |||
| constexpr int32_t PROCESS_NUM = 16; | |||
| constexpr int32_t TIME_OUT = 300; | |||
| constexpr auto kOpDesc = "op_desc"; | |||
| constexpr auto kShape = "shape"; | |||
| constexpr auto kDataType = "data_type"; | |||
| constexpr auto kInputDesc = "input_desc"; | |||
| constexpr auto kOutputDesc = "output_desc"; | |||
| constexpr auto kTensorName = "tensor_name"; | |||
| constexpr auto kCompileAkgKernelParallelFunc = "compile_akg_kernel_parallel"; | |||
| constexpr auto kMultiProcModule = "mindspore._extends.parallel_compile.akg_compiler.multi_process_compiler"; | |||
| namespace { | |||
| void UpdateTensorNameInJson(const std::vector<AnfNodePtr> &anf_nodes, | |||
| std::map<AnfNodePtr, nlohmann::json> *node_json_map) { | |||
| for (auto const &anf_node : anf_nodes) { | |||
| std::vector<int> dyn_input_sizes; | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) { | |||
| dyn_input_sizes = GetValue<const std::vector<int>>(primitive->GetAttr(kAttrDynInputSizes)); | |||
| } | |||
| bool is_dynamic_input = !dyn_input_sizes.empty(); | |||
| size_t input_num = is_dynamic_input ? dyn_input_sizes.size() : AnfAlgo::GetInputTensorNum(anf_node); | |||
| size_t real_input_index = 0; | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| size_t input_tensor_num = is_dynamic_input ? IntToSize(dyn_input_sizes[i]) : 1; | |||
| for (size_t j = 0; j < input_tensor_num; ++j) { | |||
| auto tmp_input = GetKernelInput(anf_node, real_input_index); | |||
| std::string tensor_name = GetTensorName((*node_json_map)[anf_node], kInputDesc, std::make_pair(i, j)); | |||
| if (node_json_map->find(tmp_input.first) != node_json_map->end()) { | |||
| std::string new_tensor_name = | |||
| GetTensorName((*node_json_map)[tmp_input.first], kOutputDesc, std::make_pair(0, tmp_input.second)); | |||
| SetTensorName(kInputDesc, new_tensor_name, std::make_pair(i, j), &((*node_json_map)[anf_node])); | |||
| MS_LOG(DEBUG) << "Update [" << real_input_index << "] input [" << tensor_name << "] of [" | |||
| << anf_node->fullname_with_scope() << "] to [" << tmp_input.second << "] output [" | |||
| << new_tensor_name << "] of [" << tmp_input.first->fullname_with_scope() << "]."; | |||
| } else { | |||
| MS_LOG(DEBUG) << "[" << real_input_index << "] input " << tensor_name << "] of [" | |||
| << anf_node->fullname_with_scope() << "] is out input."; | |||
| } | |||
| real_input_index++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
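| // In effect, this pass rewires the fused graph: a consumer's "input_k" tensor | |||
| // name is replaced by the producer's "output_..." name whenever the producer | |||
| // lives inside the same fusion scope, so the composite json describes one | |||
| // connected dataflow instead of isolated nodes. | |||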
| nlohmann::json GetInputsJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list, | |||
| std::map<AnfNodePtr, nlohmann::json> *node_json_map) { | |||
| nlohmann::json inputs_json; | |||
| auto input_index = GetInputIndex(anf_nodes, input_list); | |||
| for (size_t i = 0; i < input_index.size(); ++i) { | |||
| auto tmp_input = input_index[i]; | |||
| auto type_id = AnfAlgo::GetInputDeviceDataType(tmp_input.first, tmp_input.second.first); | |||
| std::string dtype = TypeId2String(type_id); | |||
| nlohmann::json input_desc_json; | |||
| input_desc_json[kTensorName] = GetTensorName((*node_json_map)[tmp_input.first], kInputDesc, tmp_input.second); | |||
| input_desc_json[kDataType] = dtype; | |||
| input_desc_json[kShape] = AnfAlgo::GetInputDeviceShape(tmp_input.first, tmp_input.second.first); | |||
| inputs_json.emplace_back(std::vector<nlohmann::json>{input_desc_json}); | |||
| } | |||
| return inputs_json; | |||
| } | |||
| nlohmann::json GetOutputsJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list, | |||
| const std::vector<AnfNodePtr> &output_list, const nlohmann::json &inputs_json, | |||
| std::map<AnfNodePtr, nlohmann::json> *node_json_map) { | |||
| nlohmann::json outputs_json; | |||
| auto output_index = GetOutputIndex(anf_nodes, input_list, output_list); | |||
| for (size_t i = 0; i < output_index.size(); ++i) { | |||
| auto tmp_output = output_index[i]; | |||
| bool found = false; | |||
| nlohmann::json output_desc_json; | |||
| for (size_t input_i = 0; input_i < input_list.size(); ++input_i) { | |||
| if (tmp_output.first == input_list[input_i]) { | |||
| output_desc_json = inputs_json[input_i][0]; | |||
| found = true; | |||
| break; | |||
| } | |||
| } | |||
| if (!found) { | |||
| auto type_id = AnfAlgo::GetOutputDeviceDataType(tmp_output.first, tmp_output.second); | |||
| std::string dtype = TypeId2String(type_id); | |||
| output_desc_json[kTensorName] = | |||
| GetTensorName((*node_json_map)[tmp_output.first], kOutputDesc, std::make_pair(0, tmp_output.second)); | |||
| output_desc_json[kDataType] = dtype; | |||
| auto output_shape = AnfAlgo::GetOutputDeviceShape(tmp_output.first, tmp_output.second); | |||
| if (output_shape.empty()) { | |||
| output_shape.push_back(1); | |||
| } | |||
| output_desc_json[kShape] = output_shape; | |||
| } | |||
| outputs_json.emplace_back(output_desc_json); | |||
| } | |||
| return outputs_json; | |||
| } | |||
| std::pair<std::vector<std::string>, std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>>> PreProcessJsonForBuild( | |||
| const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args) { | |||
| // Remove cached nodes, gather unique nodes, and collect repeated nodes which need postprocess. | |||
| std::vector<std::string> jsons; | |||
| std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> repeat_nodes; | |||
| std::unordered_set<std::string> json_name_set; | |||
| for (const auto &[builder, anf_node] : build_args) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto json_name = builder.json_name(); | |||
| MS_LOG(DEBUG) << "Akg start compile op: " << json_name; | |||
| auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); | |||
| if (cached_kernel_pack != nullptr) { | |||
| MS_LOG(DEBUG) << "Use cached kernel, json_name_[" << json_name << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(cached_kernel_pack); | |||
| kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); | |||
| kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); | |||
| AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); | |||
| continue; | |||
| } | |||
| if (json_name_set.count(json_name) != 0) { | |||
| repeat_nodes.push_back({builder, anf_node}); | |||
| continue; | |||
| } | |||
| json_name_set.insert(json_name); | |||
| auto node_json = builder.kernel_json(); | |||
| kernel::SaveJsonInfo(json_name, node_json); | |||
| jsons.push_back(node_json); | |||
| } | |||
| return std::make_pair(jsons, repeat_nodes); | |||
| } | |||
| bool PostProcessAfterCompile(const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args, | |||
| const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &repeat_nodes) { | |||
| for (const auto &[builder, anf_node] : build_args) { | |||
| auto json_name = builder.json_name(); | |||
| auto new_kernel_pack = tbe::TbeUtils::InsertCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); | |||
| if (new_kernel_pack == nullptr) { | |||
| MS_LOG(ERROR) << "Insert to cache failed, json_name_[" << json_name << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| return false; | |||
| } | |||
| auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(new_kernel_pack); | |||
| kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); | |||
| kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); | |||
| AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); | |||
| MS_LOG(DEBUG) << "Akg compile " << json_name << " kernel and insert cache successfully!"; | |||
| } | |||
| for (const auto &[builder, anf_node] : repeat_nodes) { | |||
| auto node_json = builder.kernel_json(); | |||
| auto json_name = builder.json_name(); | |||
| auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); | |||
| if (cached_kernel_pack == nullptr) { | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "Use just compiled kernel, json_name_[" << json_name << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(cached_kernel_pack); | |||
| kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); | |||
| kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); | |||
| AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace | |||
| bool AkgAscendKernelBuilder::CollectJson(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| MS_LOG(INFO) << "AKG start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; | |||
| auto it = kAkgKernelAttrsProcessMap.find(op_name); | |||
| if (it != kAkgKernelAttrsProcessMap.end()) { | |||
| it->second(anf_node); | |||
| } | |||
| MS_LOG(INFO) << "Akg start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; | |||
| nlohmann::json node_json; | |||
| if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { | |||
| MS_LOG(ERROR) << "Op[" << op_name << "] create single kernel json failed."; | |||
| } | |||
| kernel_json_ = node_json.dump(); | |||
| if (!GetIOSize(node_json, &input_size_list_, &output_size_list_)) { | |||
| MS_LOG(ERROR) << "Cal mem size failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool AkgAscendKernelBuilder::GenJsonAndPreprocess4Fused(const std::vector<AnfNodePtr> &anf_nodes, | |||
| std::map<AnfNodePtr, nlohmann::json> *node_json_map) { | |||
| for (auto const &anf_node : anf_nodes) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| if (!AnfAlgo::IsRealKernel(anf_node)) { | |||
| MS_LOG(ERROR) << "Invalid anf node to build [" << anf_node->fullname_with_scope() << "]."; | |||
| return false; | |||
| } | |||
| auto it = kAkgKernelAttrsProcessMap.find(op_name); | |||
| if (it != kAkgKernelAttrsProcessMap.end()) { | |||
| it->second(anf_node); | |||
| } | |||
| nlohmann::json node_json; | |||
| if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { | |||
| MS_LOG(ERROR) << "Op [" << op_name << "] create single kernel json failed."; | |||
| return false; | |||
| } | |||
| // No need for composite op. | |||
| node_json.erase("id"); | |||
| node_json.erase("op"); | |||
| node_json.erase("composite"); | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| if (primitive->GetAttr("fusion") != nullptr) { | |||
| node_json["fusion"] = primitive->GetAttr("fusion")->ToString(); | |||
| } | |||
| (*node_json_map)[anf_node] = node_json; | |||
| } | |||
| return true; | |||
| } | |||
| bool AkgAscendKernelBuilder::CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes, | |||
| const std::vector<AnfNodePtr> &input_list, | |||
| const std::vector<AnfNodePtr> &output_list) { | |||
| if (anf_nodes.empty() || input_list.empty()) { | |||
| MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size() | |||
| << "]."; | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "anf_nodes [" << output_list.size() << "], input_list [" << anf_nodes.size() << "], output_list [" | |||
| << input_list.size() << "]."; | |||
| std::map<AnfNodePtr, nlohmann::json> node_json_map; | |||
| if (!GenJsonAndPreprocess4Fused(anf_nodes, &node_json_map)) { | |||
| return false; | |||
| } | |||
| UpdateTensorNameInJson(anf_nodes, &node_json_map); | |||
| nlohmann::json fused_node_json; | |||
| std::vector<nlohmann::json> node_json_desc; | |||
| std::transform(anf_nodes.begin(), anf_nodes.end(), std::back_inserter(node_json_desc), | |||
| [&node_json_map](const AnfNodePtr &anf_node) { return node_json_map[anf_node]; }); | |||
| fused_node_json[kOpDesc] = node_json_desc; | |||
| fused_node_json[kInputDesc] = GetInputsJson(anf_nodes, input_list, &node_json_map); | |||
| fused_node_json[kOutputDesc] = | |||
| GetOutputsJson(anf_nodes, input_list, output_list, fused_node_json[kInputDesc], &node_json_map); | |||
| size_t hash_id = std::hash<std::string>()(fused_node_json.dump()); | |||
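| // Name the fused kernel "Fused_<graph_kernel_attr>_<hash>" so identical fusions map to the same cache entry. | |||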
| json_name_ = "Fused_"; | |||
| auto fg = anf_nodes[0]->func_graph(); | |||
| MS_EXCEPTION_IF_NULL(fg); | |||
| auto attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); | |||
| if (attr_val != nullptr) { | |||
| auto fg_attr = GetValue<std::string>(attr_val); | |||
| (void)json_name_.append(fg_attr).append("_"); | |||
| } | |||
| (void)json_name_.append(std::to_string(hash_id)); | |||
| fused_node_json["composite_graph"] = fg->ToString(); | |||
| fused_node_json["op"] = json_name_; | |||
| fused_node_json["platform"] = "AKG"; | |||
| fused_node_json["process"] = "aicore"; | |||
| fused_node_json["composite"] = true; | |||
| kernel_json_ = fused_node_json.dump(); | |||
| if (!GetIOSize(fused_node_json, &input_size_list_, &output_size_list_)) { | |||
| MS_LOG(ERROR) << "Cal mem size failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| void GenParallelCompileFuncArgs(const std::vector<std::string> &kernel_jsons, PyObject **p_args) { | |||
| MS_EXCEPTION_IF_NULL(p_args); | |||
| *p_args = PyTuple_New(PARALLEL_ARGS_SIZE); | |||
| PyObject *arg1 = PyList_New(kernel_jsons.size()); | |||
| for (Py_ssize_t i = 0; i < PyList_Size(arg1); ++i) { | |||
| PyList_SetItem(arg1, i, Py_BuildValue("s", kernel_jsons[i].c_str())); | |||
| } | |||
| PyObject *arg2 = Py_BuildValue("i", PROCESS_NUM); | |||
| PyObject *arg3 = Py_BuildValue("i", TIME_OUT); | |||
| (void)PyTuple_SetItem(*p_args, 0, arg1); | |||
| (void)PyTuple_SetItem(*p_args, 1, arg2); | |||
| (void)PyTuple_SetItem(*p_args, 2, arg3); | |||
| } | |||
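| // The python entry point (kCompileAkgKernelParallelFunc in kMultiProcModule) is expected to take | |||
| // (json_list, process_num, timeout) and return a truthy value on success, matching the argument | |||
| // packing above and the PyObject_IsTrue check below. | |||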
| bool AkgOpParallelBuild(const std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> &build_args) { | |||
| auto [jsons, repeat_nodes] = PreProcessJsonForBuild(build_args); | |||
| if (jsons.empty()) { | |||
| return true; | |||
| } | |||
| // Try to call the python method to compile the nodes in parallel. | |||
| PyObject *p_module = nullptr; | |||
| PyObject *p_func = nullptr; | |||
| PyObject *p_arg = nullptr; | |||
| PyObject *p_res = nullptr; | |||
| p_module = PyImport_ImportModule(kMultiProcModule); | |||
| if (p_module == nullptr) { | |||
| MS_LOG(ERROR) << "Failed to import [" << kMultiProcModule << "]."; | |||
| return false; | |||
| } | |||
| p_func = PyObject_GetAttrString(p_module, kCompileAkgKernelParallelFunc); | |||
| GenParallelCompileFuncArgs(jsons, &p_arg); | |||
| MS_LOG(DEBUG) << "Call function [" << kCompileAkgKernelParallelFunc << "], try to compile " << jsons.size() | |||
| << " Akg kernels parallelly."; | |||
| p_res = PyEval_CallObject(p_func, p_arg); | |||
| if (p_res == nullptr) { | |||
| PyErr_Print(); | |||
| MS_LOG(ERROR) << "No ret got, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" | |||
| << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; | |||
| return false; | |||
| } | |||
| if (PyObject_IsTrue(p_res) != 1) { | |||
| PyErr_Print(); | |||
| MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" | |||
| << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; | |||
| return false; | |||
| } | |||
| if (!PostProcessAfterCompile(build_args, repeat_nodes)) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) { | |||
| std::vector<std::pair<AkgAscendKernelBuilder, AnfNodePtr>> json_and_node; | |||
| for (const auto &anf_node : anf_nodes) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| AkgAscendKernelBuilder akg_cce_kernel_builder; | |||
| KernelPackPtr kernel_pack = nullptr; | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (AnfAlgo::IsGraphKernel(cnode)) { | |||
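| // Composite (graph kernel) node: collect one fused json over all real kernels in its sub-graph. | |||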
| auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cnode); | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| auto mng = func_graph->manager(); | |||
| if (mng == nullptr) { | |||
| mng = Manage(func_graph, true); | |||
| func_graph->set_manager(mng); | |||
| } | |||
| std::vector<AnfNodePtr> node_list; | |||
| std::vector<AnfNodePtr> input_list; | |||
| std::vector<AnfNodePtr> output_list; | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| MS_LOG(INFO) << "Akg start compile composite op[" << op_name << "]"; | |||
| GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); | |||
| if (!akg_cce_kernel_builder.CollectFusedJson(node_list, input_list, output_list)) { | |||
| MS_EXCEPTION(UnknownError) << "Akg build failed composite op[" << op_name << "]."; | |||
| } | |||
| } else { | |||
| if (!akg_cce_kernel_builder.CollectJson(anf_node)) { | |||
| MS_EXCEPTION(UnknownError) << "Akg build failed op[" << AnfAlgo::GetCNodeName(anf_node) << "]."; | |||
| } | |||
| } | |||
| json_and_node.push_back({akg_cce_kernel_builder, anf_node}); | |||
| } | |||
| if (json_and_node.empty()) { | |||
| MS_LOG(DEBUG) << "There is no kernel needed to be compiled."; | |||
| return true; | |||
| } | |||
| return AkgOpParallelBuild(json_and_node); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ | |||
| #include <string> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <map> | |||
| #include "ir/anf.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/akg/akg_kernel_build.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AkgAscendKernelBuilder : public AkgKernelBuild { | |||
| public: | |||
| AkgAscendKernelBuilder() = default; | |||
| ~AkgAscendKernelBuilder() = default; | |||
| bool CollectJson(const AnfNodePtr &anf_node); | |||
| bool CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes, const std::vector<AnfNodePtr> &input_list, | |||
| const std::vector<AnfNodePtr> &output_list); | |||
| std::string json_name() const { return json_name_; } | |||
| std::string kernel_json() const { return kernel_json_; } | |||
| const std::vector<size_t> &input_size_list() const { return input_size_list_; } | |||
| const std::vector<size_t> &output_size_list() const { return output_size_list_; } | |||
| private: | |||
| bool GenJsonAndPreprocess4Fused(const std::vector<AnfNodePtr> &anf_nodes, | |||
| std::map<AnfNodePtr, nlohmann::json> *node_json_map); | |||
| std::string kernel_json_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| }; | |||
| bool AkgAscendKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ | |||
| @@ -0,0 +1,132 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_mod.h" | |||
| #include <algorithm> | |||
| #include <fstream> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "nlohmann/json.hpp" | |||
| #include "runtime/rt.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/convert_utils.h" | |||
| #include "utils/context/ms_context.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using std::fstream; | |||
| using std::map; | |||
| using std::mutex; | |||
| using std::string; | |||
| using TbeTaskInfoPtr = std::shared_ptr<ge::model_runner::TbeTaskInfo>; | |||
| using tbe::KernelManager; | |||
| constexpr uint32_t DEFAULT_BLOCK_DIM = 1; | |||
| /** | |||
| * @brief The info table holds the func_stub, block_dim and kernel file buffer for each compiled kernel. | |||
| */ | |||
| AkgKernelMod::AkgKernelMod(const KernelPackPtr &kernel_pack) : kernel_pack_(kernel_pack) {} | |||
| void AkgKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; } | |||
| void AkgKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; } | |||
| void AkgKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; } | |||
| const std::vector<size_t> &AkgKernelMod::GetInputSizeList() const { return input_size_list_; } | |||
| const std::vector<size_t> &AkgKernelMod::GetOutputSizeList() const { return output_size_list_; } | |||
| const std::vector<size_t> &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(ERROR) << "kernel pack should not be nullptr."; | |||
| return false; | |||
| } | |||
| uint32_t block_dim = DEFAULT_BLOCK_DIM;  // default block dim is 1. | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); | |||
| if (func_stub == 0) { | |||
| MS_LOG(ERROR) << "GenFuncStub failed."; | |||
| return false; | |||
| } | |||
| // pack all addresses into a vector. | |||
| std::vector<void *> runtime_args; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtime_args), | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| auto stream = reinterpret_cast<rtStream_t *>(stream_ptr); | |||
| if (RT_ERROR_NONE != rtKernelLaunch(reinterpret_cast<void *>(func_stub), block_dim, runtime_args.data(), | |||
| SizeToUint(sizeof(void *) * runtime_args.size()), l2ctrl, stream)) { | |||
| MS_LOG(ERROR) << "Call runtime rtKernelLaunch error."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) { | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "kernel pack should not be nullptr."; | |||
| } | |||
| std::vector<uint8_t> args; | |||
| const uint32_t args_size = 0; | |||
| std::vector<uint8_t> sm_desc; | |||
| void *binary = nullptr; | |||
| const uint32_t binary_size = 0; | |||
| std::vector<uint8_t> meta_data; | |||
| std::vector<void *> input_data_addrs; | |||
| std::vector<void *> output_data_addrs; | |||
| std::vector<void *> workspace_addrs; | |||
| // pack all addresses into a vector. | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs), | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| uint32_t block_dim = DEFAULT_BLOCK_DIM;  // default block dim is 1. | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); | |||
| if (func_stub == 0) { | |||
| MS_LOG(EXCEPTION) << "GenFuncStub failed."; | |||
| } | |||
| std::string stub_func = KernelManager::GetStubFuncName(kernel_pack_); | |||
| MS_LOG(DEBUG) << "The block_dim is:" << block_dim; | |||
| TbeTaskInfoPtr task_info_ptr = std::make_shared<ge::model_runner::TbeTaskInfo>( | |||
| kernel_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data, | |||
| input_data_addrs, output_data_addrs, workspace_addrs, NeedDump()); | |||
| return {task_info_ptr}; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/ascend_kernel_mod.h" | |||
| #include "backend/kernel_compiler/tbe/tbe_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AkgKernelMod : public AscendKernelMod { | |||
| public: | |||
| explicit AkgKernelMod(const KernelPackPtr &kernel_pack); | |||
| ~AkgKernelMod() final {} | |||
| void SetInputSizeList(const std::vector<size_t> &size_list); | |||
| void SetOutputSizeList(const std::vector<size_t> &size_list); | |||
| void SetWorkspaceSizeList(const std::vector<size_t> &size_list); | |||
| const std::vector<size_t> &GetInputSizeList() const override; | |||
| const std::vector<size_t> &GetOutputSizeList() const override; | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) override; | |||
| private: | |||
| KernelPackPtr kernel_pack_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| using AkgKernelModPtr = std::shared_ptr<AkgKernelMod>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h" | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/akg/akg_kernel_build.h" | |||
| #include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| KernelModPtr AkgGpuKernelBuild(const AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| AkgKernelBuild akg_kernel_build; | |||
| std::vector<size_t> input_size_list; | |||
| std::vector<size_t> output_size_list; | |||
| KernelPackPtr kernel_pack = akg_kernel_build.BuildByJson(anf_node, &input_size_list, &output_size_list); | |||
| MS_EXCEPTION_IF_NULL(kernel_pack); | |||
| auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod_ptr); | |||
| kernel_mod_ptr->SetInputSizeList(input_size_list); | |||
| kernel_mod_ptr->SetOutputSizeList(output_size_list); | |||
| return kernel_mod_ptr; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,28 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "base/base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| KernelModPtr AkgGpuKernelBuild(const AnfNodePtr &anf_node); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_BUILD_H_ | |||
| @@ -0,0 +1,116 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.h" | |||
| #include <fstream> | |||
| #include <algorithm> | |||
| #include "nlohmann/json.hpp" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using std::fstream; | |||
| using std::string; | |||
| using std::vector; | |||
| GpuKernelManagerPtr GpuKernelMod::kernelmanager_ = std::make_shared<GpuKernelManager>(); | |||
| GpuKernelManager::GpuKernelManager() {} | |||
| CUresult GpuKernelManager::GetFunction(const KernelPackPtr &kernel_pack, bool force_reload, | |||
| vector<uint32_t> *thread_info, CUfunction *func) { | |||
| if (kernel_pack->GetJson() == nullptr || kernel_pack->GetJson()->contents == nullptr || | |||
| kernel_pack->GetKernel() == nullptr || kernel_pack->GetKernel()->contents == nullptr) { | |||
| MS_LOG(ERROR) << "GPU:Invalid kernel pack, json or kernel is nullptr."; | |||
| return CUDA_ERROR_INVALID_IMAGE; | |||
| } | |||
| auto js = nlohmann::json::parse(kernel_pack->GetJson()->contents, | |||
| kernel_pack->GetJson()->contents + kernel_pack->GetJson()->len); | |||
| string fn = js["kernelName"]; | |||
| if (!force_reload) { | |||
| auto iter = infotable_.find(fn); | |||
| if (iter != infotable_.end()) { | |||
| auto kernelmeta = iter->second; | |||
| *thread_info = kernelmeta->thread_info_; | |||
| *func = kernelmeta->func_addr_; | |||
| return CUDA_SUCCESS; | |||
| } | |||
| } | |||
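| // Grid dims first (blockIdx.x/y/z), then block dims (threadIdx.x/y/z), in the order cuLaunchKernel consumes them. | |||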
| thread_info->emplace_back(js["blockIdx.x"]); | |||
| thread_info->emplace_back(js["blockIdx.y"]); | |||
| thread_info->emplace_back(js["blockIdx.z"]); | |||
| thread_info->emplace_back(js["threadIdx.x"]); | |||
| thread_info->emplace_back(js["threadIdx.y"]); | |||
| thread_info->emplace_back(js["threadIdx.z"]); | |||
| CUmodule module; | |||
| CUresult result = cuModuleLoadData(&module, kernel_pack->GetKernel()->contents); | |||
| if (result != CUDA_SUCCESS) { | |||
| MS_LOG(ERROR) << "cuModuleLoadData failed."; | |||
| return result; | |||
| } | |||
| result = cuModuleGetFunction(func, module, fn.c_str()); | |||
| if (result != CUDA_SUCCESS) { | |||
| MS_LOG(ERROR) << "cuModuleGetFunction failed."; | |||
| return result; | |||
| } | |||
| infotable_[fn] = std::make_shared<GpuKernelMeta>(*func, module, *thread_info); | |||
| return result; | |||
| } | |||
| GpuKernelMod::GpuKernelMod(const KernelPackPtr &kernel_pack) : kernel_pack_(kernel_pack) {} | |||
| void GpuKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; } | |||
| void GpuKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; } | |||
| const std::vector<size_t> &GpuKernelMod::GetInputSizeList() const { return input_size_list_; } | |||
| const std::vector<size_t> &GpuKernelMod::GetOutputSizeList() const { return output_size_list_; } | |||
| const std::vector<size_t> &GpuKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(ERROR) << "kernel pack should not be nullptr."; | |||
| return false; | |||
| } | |||
| vector<uint32_t> thread_info; | |||
| CUfunction kernel_addr; | |||
| CUresult result = kernelmanager_->GetFunction(kernel_pack_, false, &thread_info, &kernel_addr); | |||
| if (result != CUDA_SUCCESS) { | |||
| MS_LOG(ERROR) << "GetFunction failed."; | |||
| return false; | |||
| } | |||
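| // cuLaunchKernel takes an array of pointers to the kernel parameters, so pass the address of each addr field. | |||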
| std::vector<void *> runtimeargs; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &input) -> void * { return reinterpret_cast<void *>(&(input->addr)); }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); }); | |||
| result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4], | |||
| thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr), | |||
| reinterpret_cast<void **>(&runtimeargs[0]), 0); | |||
| if (result != CUDA_SUCCESS) { | |||
| MS_LOG(ERROR) << "Launch Kernel failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,82 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_MOD_H_ | |||
| #include <cuda.h> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| struct GpuKernelMeta { | |||
| CUfunction func_addr_; | |||
| CUmodule module_; | |||
| std::vector<uint32_t> thread_info_; | |||
| GpuKernelMeta(CUfunction funcAddr, CUmodule module, const std::vector<uint32_t> &thread_info) | |||
| : func_addr_(funcAddr), module_(module), thread_info_(thread_info) {} | |||
| }; | |||
| using GpuKernelMetaPtr = std::shared_ptr<GpuKernelMeta>; | |||
| class GpuKernelManager { | |||
| public: | |||
| GpuKernelManager(); | |||
| virtual ~GpuKernelManager() { | |||
| for (auto iter = infotable_.begin(); iter != infotable_.end(); ++iter) { | |||
| CUresult ret = cuModuleUnload(iter->second->module_); | |||
| if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_DEINITIALIZED) { | |||
| MS_LOG(ERROR) << "Unload GPU Module failed."; | |||
| } | |||
| } | |||
| } | |||
| CUresult GetFunction(const KernelPackPtr &kernel_pack, bool force_reload, std::vector<uint32_t> *thread_info, | |||
| CUfunction *func); | |||
| private: | |||
| std::unordered_map<std::string, GpuKernelMetaPtr> infotable_; | |||
| }; | |||
| using GpuKernelManagerPtr = std::shared_ptr<GpuKernelManager>; | |||
| class GpuKernelMod : public KernelMod { | |||
| public: | |||
| explicit GpuKernelMod(const KernelPackPtr &kernel_pack); | |||
| virtual ~GpuKernelMod() {} | |||
| void SetInputSizeList(const std::vector<size_t> &size_list); | |||
| void SetOutputSizeList(const std::vector<size_t> &size_list); | |||
| const std::vector<size_t> &GetInputSizeList() const override; | |||
| const std::vector<size_t> &GetOutputSizeList() const override; | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| static GpuKernelManagerPtr kernelmanager_; | |||
| private: | |||
| KernelPackPtr kernel_pack_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| using GpuKernelModPtr = std::shared_ptr<GpuKernelMod>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_AKG_GPU_AKG_GPU_KERNEL_MOD_H_ | |||
| @@ -0,0 +1,52 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "framework/ge_runtime/task_info.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #ifdef ENABLE_DATA_DUMP | |||
| #include "debug/data_dump_parser.h" | |||
| #endif | |||
| using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AscendKernelMod : public KernelMod { | |||
| public: | |||
| virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, uint32_t) = 0; | |||
| uint32_t block_dim() { return block_dim_; } | |||
| uint32_t stream_id() { return stream_id_; } | |||
| virtual bool NeedDump() { | |||
| #ifdef ENABLE_DATA_DUMP | |||
| return DataDumpParser::GetInstance().NeedDump(kernel_name_); | |||
| #else | |||
| return false; | |||
| #endif | |||
| } | |||
| protected: | |||
| uint32_t block_dim_{1}; | |||
| uint32_t stream_id_{0}; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_ | |||
| @@ -0,0 +1,145 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_COMMON_UTILS_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_COMMON_UTILS_H_ | |||
| #include <dirent.h> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <unordered_set> | |||
| #include <map> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include <nlohmann/json.hpp> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/oplib/opinfo.h" | |||
| #include "backend/kernel_compiler/kernel_build_info.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr auto kCceKernelMeta = "./kernel_meta/"; | |||
| constexpr auto kGpuKernelMeta = "./cuda_meta"; | |||
| constexpr auto kProcessorAiCore = "aicore"; | |||
| constexpr auto kProcessorAiCpu = "aicpu"; | |||
| constexpr auto kProcessorCuda = "cuda"; | |||
| constexpr auto kJsonSuffix = ".json"; | |||
| constexpr auto kInfoSuffix = ".info"; | |||
| constexpr unsigned int AUTODIFF_COMPILE_OVERTIME = 600; | |||
| constexpr auto kAkgModule = "_akg"; | |||
| constexpr auto kArgDataformat = "data_format"; | |||
| const std::vector<std::string> support_devices = {"aicore", "aicpu", "cuda"}; | |||
| struct KernelMetaInfo { | |||
| uintptr_t func_stub_; | |||
| uint32_t block_dim_; | |||
| }; | |||
| using KernelMetaPtr = std::shared_ptr<KernelMetaInfo>; | |||
| class KernelMeta { | |||
| public: | |||
| KernelMeta() = default; | |||
| void Initialize(); | |||
| void RemoveKernelCache(); | |||
| std::string Search(const std::string &kernel_name) const; | |||
| bool Insert(const std::string &kernel_name, const std::string &kernel_json); | |||
| std::string GetKernelMetaPath() { return kernel_meta_path_; } | |||
| static KernelMeta *GetInstance() { | |||
| static KernelMeta kernel_meta; | |||
| return &kernel_meta; | |||
| } | |||
| ~KernelMeta() = default; | |||
| private: | |||
| bool initialized_ = false; | |||
| std::string kernel_meta_path_; | |||
| std::unordered_map<std::string, std::string> kernel_meta_map_; | |||
| }; | |||
| struct SparseGradient { | |||
| float *value_; | |||
| int *indices_; | |||
| size_t indices_size_; | |||
| }; | |||
| struct MultiThreadComputeParams { | |||
| float *var_; | |||
| float *accum_; | |||
| float *linear_; | |||
| float *m_; | |||
| float *m_t_; | |||
| float *v_; | |||
| float lr_; | |||
| float l1_; | |||
| float l2_; | |||
| float lr_power_; | |||
| float beta1_; | |||
| float beta2_; | |||
| float epsilon_; | |||
| SparseGradient sparse_grad_; | |||
| size_t var_first_dim_size_; | |||
| size_t var_outer_dim_size_; | |||
| bool use_nesterov_; | |||
| }; | |||
| using MultiThreadComputeFunc = std::function<void(MultiThreadComputeParams *param, size_t start, size_t end)>; | |||
| bool CheckCache(const std::string &kernel_name); | |||
| KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor); | |||
| KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor); | |||
| TypeId DtypeToTypeId(const std::string &dtypes); | |||
| std::string Dtype2ShortType(const std::string &dtypes); | |||
| std::string TypeId2String(TypeId type_id); | |||
| size_t GetDtypeNbyte(const std::string &dtypes); | |||
| bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr<const OpInfo> &op_info_ptr, Processor processor, | |||
| std::vector<std::shared_ptr<KernelBuildInfo>> *const kernel_info_list); | |||
| void SaveJsonInfo(const std::string &json_name, const std::string &info); | |||
| std::string GetProcessor(const AnfNodePtr &anf_node); | |||
| bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b); | |||
| int Sign(float x); | |||
| void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, | |||
| size_t outer_dim); | |||
| void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, | |||
| size_t outer_dim, bool use_multi_threads = true); | |||
| std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index); | |||
| std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list, | |||
| const std::vector<AnfNodePtr> &input_list); | |||
| std::vector<std::pair<AnfNodePtr, size_t>> GetOutputIndex(const std::vector<AnfNodePtr> &node_list, | |||
| const std::vector<AnfNodePtr> &input_list, | |||
| const std::vector<AnfNodePtr> &output_list); | |||
| void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *node_list, | |||
| std::vector<AnfNodePtr> *input_list, std::vector<AnfNodePtr> *output_list); | |||
| void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr> *node_list); | |||
| bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json); | |||
| void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<AnfNodePtr, size_t>> *node_list); | |||
| bool IsWeightBoundary(const AnfNodePtr &node); | |||
| void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, | |||
| size_t total_compute_size); | |||
| void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, | |||
| size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices, | |||
| std::vector<size_t> *slice_positions); | |||
| void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads, | |||
| SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim, | |||
| size_t outer_dim); | |||
| void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad, | |||
| SparseGradient *unique_grad, size_t first_dim, size_t outer_dim); | |||
| std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_COMMON_UTILS_H_ | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/addn_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| input_num_ = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| CPUKernelUtils::ExpandDimsTo4(&output_shape_); | |||
| } | |||
| bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| size_t offset = 0; | |||
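| // Shapes were expanded to 4-D in InitKernel; walk every element and sum the matching entry of each input. | |||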
| for (size_t i = 0; i < output_shape_[0]; ++i) { | |||
| for (size_t j = 0; j < output_shape_[1]; ++j) { | |||
| for (size_t k = 0; k < output_shape_[2]; ++k) { | |||
| for (size_t m = 0; m < output_shape_[3]; ++m) { | |||
| float sum = 0; | |||
| for (size_t index = 0; index < input_num_; ++index) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[index]->addr); | |||
| sum += input_addr[offset]; | |||
| } | |||
| output_addr[offset++] = sum; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", but AddNCPUKernel olny support 4d or lower."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ADDN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_ADDN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AddNCPUKernel : public CPUKernel { | |||
| public: | |||
| AddNCPUKernel() : input_num_(0) {} | |||
| ~AddNCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| size_t input_num_; | |||
| std::vector<size_t> output_shape_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(AddN, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| AddNCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_ADDN_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,53 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| #include "utils/log_adapter.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr auto kRanksGroup = "group"; | |||
| constexpr auto kAllGatherInputNum = 1; | |||
| } // namespace | |||
| void AllGatherCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != kAllGatherInputNum) { | |||
| MS_LOG(EXCEPTION) << "allgather input num:" << input_num; | |||
| } | |||
| auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup); | |||
| if (ranks_group != nullptr) { | |||
| ranks_group_ = GetValue<std::vector<int>>(ranks_group); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup; | |||
| } | |||
| } | |||
| bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_data_num = inputs[0]->size / sizeof(float); | |||
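| // Gather the local float buffer from every rank in ranks_group_ into the output buffer via MPI. | |||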
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| return mpi_instance->AllGather(input_addr, output_addr, ranks_group_, input_data_num); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class AllGatherCPUKernel : public CPUKernel { | |||
| public: | |||
| AllGatherCPUKernel() = default; | |||
| ~AllGatherCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| std::vector<int> ranks_group_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(_HostAllGather, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| AllGatherCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif  // MINDSPORE_CCSRC_KERNEL_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,47 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ApplyMomentumCPUKernel::InitKernel(const CNodePtr & /*kernel_node*/) {} | |||
| bool ApplyMomentumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> & /*outputs*/) { | |||
| if (inputs.size() < 5) { | |||
| MS_LOG(EXCEPTION) << "ApplyMomentum needs 5 inputs (weight, accumulate, lr, gradient, momentum), but got " | |||
| << inputs.size(); | |||
| } | |||
| if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) { | |||
| MS_LOG(EXCEPTION) << "Input data sizes do not match: weight, accumulate and gradient must be the same size."; | |||
| } | |||
| auto weight = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto accumulate = reinterpret_cast<float *>(inputs[1]->addr); | |||
| float learning_rate = reinterpret_cast<float *>(inputs[2]->addr)[0]; | |||
| auto gradient = reinterpret_cast<float *>(inputs[3]->addr); | |||
| float moment = reinterpret_cast<float *>(inputs[4]->addr)[0]; | |||
| size_t elem_num = inputs[0]->size / sizeof(float); | |||
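| // Momentum update: accum = accum * momentum + grad; weight -= accum * lr. | |||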
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i]; | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ApplyMomentumCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| ApplyMomentumCPUKernel() = default; | |||
| ~ApplyMomentumCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| ApplyMomentumCPUKernel); | |||
| MS_REG_CPU_KERNEL(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| ApplyMomentumCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,67 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmax_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ArgmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (shape.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "argmax kernel dims invalid " << shape.size(); | |||
| } | |||
| batch_size_ = shape[0]; | |||
| class_num_ = shape[1]; | |||
| int axis = AnfAlgo::GetNodeAttr<int>(kernel_node, AXIS); | |||
| if (axis != -1 && axis != 1) { | |||
| MS_LOG(EXCEPTION) << "argmax kernel not support axis " << axis; | |||
| } | |||
| } | |||
| bool ArgmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspaces*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "input or output empty!"; | |||
| } | |||
| size_t batch_float_size = batch_size_ * sizeof(float); | |||
| size_t batch_class_float_size = class_num_ * batch_float_size; | |||
| if (inputs[0]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { | |||
| MS_LOG(EXCEPTION) << "invalid input or output data size!"; | |||
| } | |||
| auto input = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| size_t row_start = 0; | |||
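| // Row-major scan: for each batch row, record the index of the maximum over class_num_ entries. | |||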
| for (size_t i = 0; i < batch_size_; ++i) { | |||
| size_t max_index = 0; | |||
| float max_value = input[row_start]; | |||
| for (size_t j = 1; j < class_num_; ++j) { | |||
| size_t index = row_start + j; | |||
| if (input[index] > max_value) { | |||
| max_value = input[index]; | |||
| max_index = j; | |||
| } | |||
| } | |||
| output[i] = SizeToInt(max_index); | |||
| row_start += class_num_; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,45 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ArgmaxCPUKernel : public CPUKernel { | |||
| public: | |||
| ArgmaxCPUKernel() = default; | |||
| ~ArgmaxCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| size_t class_num_{0}; | |||
| size_t batch_size_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32), | |||
| ArgmaxCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,82 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| bias_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| if (input_shape_.size() == 4) { | |||
| data_shape_ = 4; | |||
| } else if (input_shape_.size() == 2) { | |||
| data_shape_ = 2; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "BiasAdd input shape must be 4D (NCHW) or 2D (NC), but got " << input_shape_.size() << "D"; | |||
| } | |||
| if (bias_shape_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "BiasAdd bias must be a 1D tensor, but got " << bias_shape_.size() << "D"; | |||
| } | |||
| if (input_shape_[1] != bias_shape_[0]) { | |||
| MS_LOG(EXCEPTION) << "BiasAdd bias length " << bias_shape_[0] << " does not match input channel dim " << input_shape_[1]; | |||
| } | |||
| } | |||
| bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "inputs outputs size not supoort"; | |||
| } | |||
| auto src_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto bias_addr = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| if (data_shape_ == 4) { | |||
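| // NCHW layout: broadcast bias_addr[c] across every (h, w) position of channel c. | |||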
| size_t h_size = input_shape_[3]; | |||
| size_t c_size = input_shape_[2] * h_size; | |||
| size_t n_size = input_shape_[1] * c_size; | |||
| size_t hw_size = input_shape_[2] * input_shape_[3]; | |||
| size_t n_offset = 0; | |||
| for (size_t n = 0; n < input_shape_[0]; ++n) { | |||
| size_t c_offset = 0; | |||
| for (size_t c = 0; c < input_shape_[1]; ++c) { | |||
| for (size_t hw = 0; hw < hw_size; ++hw) { | |||
| size_t offset = n_offset + c_offset + hw; | |||
| output_addr[offset] = src_addr[offset] + bias_addr[c]; | |||
| } | |||
| c_offset += c_size; | |||
| } | |||
| n_offset += n_size; | |||
| } | |||
| } else { | |||
| size_t n_offset = 0; | |||
| for (size_t n = 0; n < input_shape_[0]; ++n) { | |||
| for (size_t c = 0; c < input_shape_[1]; ++c) { | |||
| output_addr[n_offset + c] = src_addr[n_offset + c] + bias_addr[c]; | |||
| } | |||
| n_offset += input_shape_[1]; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class BiasAddCPUKernel : public CPUKernel { | |||
| public: | |||
| BiasAddCPUKernel() = default; | |||
| ~BiasAddCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| uint8_t data_shape_{0}; | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> bias_shape_; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| BiasAdd, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| BiasAddCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (input_shape_.size() != 4 && input_shape_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "input data format not support"; | |||
| } | |||
| } | |||
| bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "input output size not support"; | |||
| } | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| if (input_shape_.size() == 4) { | |||
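| // dL/db[c] accumulates the incoming gradient over the N, H and W dimensions of channel c. | |||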
| size_t h_size = input_shape_[3]; | |||
| size_t c_size = h_size * input_shape_[2]; | |||
| size_t n_size = c_size * input_shape_[1]; | |||
| size_t hw_size = input_shape_[2] * input_shape_[3]; | |||
| size_t c_offset = 0; | |||
| for (size_t c = 0; c < input_shape_[1]; ++c) { | |||
| output_addr[c] = 0; | |||
| size_t n_offset = 0; | |||
| for (size_t n = 0; n < input_shape_[0]; ++n) { | |||
| for (size_t hw = 0; hw < hw_size; ++hw) { | |||
| size_t offset = c_offset + n_offset + hw; | |||
| output_addr[c] += input_addr[offset]; | |||
| } | |||
| n_offset += n_size; | |||
| } | |||
| c_offset += c_size; | |||
| } | |||
| } else if (input_shape_.size() == 2) { | |||
| for (size_t c = 0; c < input_shape_[1]; ++c) { | |||
| output_addr[c] = 0; | |||
| size_t n_offset = 0; | |||
| for (size_t n = 0; n < input_shape_[0]; ++n) { | |||
| output_addr[c] += input_addr[c + n_offset]; | |||
| n_offset += input_shape_[1]; | |||
| } | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class BiasAddGradCPUKernel : public CPUKernel { | |||
| public: | |||
| BiasAddGradCPUKernel() = default; | |||
| ~BiasAddGradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| std::vector<size_t> input_shape_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(BiasAddGrad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| BiasAddGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,106 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/concat_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ConcatCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| axis_ = AnfAlgo::GetNodeAttr<int>(kernel_node, AXIS); | |||
| auto input_1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (axis_ < 0) { | |||
| axis_ = axis_ + SizeToInt(input_1_shape.size()); | |||
| } | |||
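| // Inputs are left-padded to 4D below, so shift the axis into the expanded coordinate system. | |||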
| axis_ += 4 - input_1_shape.size(); | |||
| auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| for (size_t i = 0; i < input_num; i++) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); | |||
| CPUKernelUtils::ExpandDimsTo4(&input_shape); | |||
| input_shape_list_.push_back(input_shape); | |||
| } | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| CPUKernelUtils::ExpandDimsTo4(&output_shape_); | |||
| } | |||
| bool ConcatCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto buff_size = outputs[0]->size; | |||
| size_t dim0 = output_shape_[0]; | |||
| size_t dim1 = output_shape_[1]; | |||
| size_t dim2 = output_shape_[2]; | |||
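| // Iterate over every prefix coordinate before the concat axis; each CopyDataToOutput call stitches the matching slice from all inputs. | |||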
| if (axis_ == 3) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| for (size_t k = 0; k < dim2; ++k) { | |||
| CopyDataToOutput(inputs, i, j, k, &output_addr, &buff_size); | |||
| } | |||
| } | |||
| } | |||
| } else if (axis_ == 2) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| CopyDataToOutput(inputs, i, j, 0, &output_addr, &buff_size); | |||
| } | |||
| } | |||
| } else if (axis_ == 1) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| CopyDataToOutput(inputs, i, 0, 0, &output_addr, &buff_size); | |||
| } | |||
| } else if (axis_ == 0) { | |||
| CopyDataToOutput(inputs, 0, 0, 0, &output_addr, &buff_size); | |||
| } | |||
| return true; | |||
| } | |||
| void ConcatCPUKernel::CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, | |||
| size_t dim2, float **output_addr, size_t *buff_size) { | |||
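| // For a fixed (dim0, dim1, dim2) prefix, append each input's contiguous run along axis_ to the output, | |||
| // advancing the output cursor and shrinking the remaining buffer size accordingly. | |||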
| for (size_t i = 0; i < input_shape_list_.size(); ++i) { | |||
| auto input_i_shape = input_shape_list_[i]; | |||
| auto input_i_addr = reinterpret_cast<float *>(inputs[i]->addr); | |||
| size_t num = CPUKernelUtils::GetElementNumOnAxis(input_i_shape, axis_); | |||
| num *= input_i_shape[axis_]; | |||
| auto pos = CPUKernelUtils::CalcOffset(input_i_shape, dim0, dim1, dim2, 0); | |||
| auto ret = memcpy_s(*output_addr, *buff_size, input_i_addr + pos, num * sizeof(float)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "memcpy failed."; | |||
| } | |||
| *output_addr += num; | |||
| *buff_size -= num * sizeof(float); | |||
| } | |||
| } | |||
| void ConcatCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", but ConcatCPUKernel olny support 4d or lower."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ConcatCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ConcatCPUKernel : public CPUKernel { | |||
| public: | |||
| ConcatCPUKernel() : axis_(0) {} | |||
| ~ConcatCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| void CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2, | |||
| float **output_addr, size_t *buff_size); | |||
| int axis_; | |||
| std::vector<std::vector<size_t>> input_shape_list_; | |||
| std::vector<size_t> output_shape_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Concat, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| ConcatCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CONCAT_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,80 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
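| // Tensor sizes are computed with a fixed 4-byte element size; the dtypes registered for these CPU kernels (float32, int32) are all 4 bytes wide. | |||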
| for (size_t input_index = 0; input_index < input_num; ++input_index) { | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index); | |||
| size_t tensor_size = | |||
| shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| input_size_list_.emplace_back(tensor_size); | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| for (size_t output_index = 0; output_index < output_num; ++output_index) { | |||
| std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index); | |||
| size_t tensor_size = | |||
| shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| output_size_list_.emplace_back(tensor_size); | |||
| } | |||
| } | |||
| void CPUKernel::Init(const CNodePtr &kernel_node) { | |||
| InitKernel(kernel_node); | |||
| InitInputOutputSize(kernel_node); | |||
| } | |||
| void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) { | |||
| auto len = shape->size(); | |||
| if (len < 4) { | |||
| for (size_t i = 0; i < 4 - len; ++i) { | |||
| shape->insert(shape->begin(), 1); | |||
| } | |||
| } | |||
| } | |||
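| // CalcOffset maps 4D coordinates to a flat row-major offset: | |||
| // offset = ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3. | |||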
| size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t dim3) { | |||
| size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3; | |||
| return offset; | |||
| } | |||
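| // GetElementNumOnAxis returns the row-major stride of `axis` in a 4D shape, i.e. the element count of | |||
| // one slice below it (for [N, C, H, W] and axis = 1 this is H * W). | |||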
| size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| } | |||
| size_t result = 1; | |||
| for (int j = 3; j > axis; --j) { | |||
| result *= shape[j]; | |||
| } | |||
| return result; | |||
| } | |||
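| // GetElementNumEveryDim emits the row-major stride of every dim, e.g. shape {2, 3, 4} -> {12, 4, 1}. | |||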
| void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) { | |||
| size_t accumulation = 1; | |||
| element_num->emplace_back(1); | |||
| for (size_t i = shape.size() - 1; i > 0; --i) { | |||
| accumulation *= shape[i]; | |||
| element_num->emplace_back(accumulation); | |||
| } | |||
| std::reverse(element_num->begin(), element_num->end()); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,87 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const char KSIZE[] = "ksize"; | |||
| const char STRIDE[] = "stride"; | |||
| const char STRIDES[] = "strides"; | |||
| const char DILATION[] = "dilation"; | |||
| const char PAD[] = "pad"; | |||
| const char PAD_MODE[] = "pad_mode"; | |||
| const char PADDING[] = "padding"; | |||
| const char PAD_MODE_LOWER_SAME[] = "same"; | |||
| const char PAD_MODE_LOWER_VALID[] = "valid"; | |||
| const char PAD_MODE_UPPER_SAME[] = "SAME"; | |||
| const char PAD_MODE_UPPER_VALID[] = "VALID"; | |||
| const char TRANSPOSE_A[] = "transpose_a"; | |||
| const char TRANSPOSE_B[] = "transpose_b"; | |||
| const char IS_GRAD[] = "is_grad"; | |||
| const char TRANSPOSE_NO = 'N'; | |||
| const char TRANSPOSE_YES = 'T'; | |||
| const char AXIS[] = "axis"; | |||
| const char BEGIN[] = "begin"; | |||
| const char END[] = "end"; | |||
| const char SIZE[] = "size"; | |||
| const char USE_NESTEROV[] = "use_nesterov"; | |||
| class CPUKernel : public kernel::KernelMod { | |||
| public: | |||
| CPUKernel() = default; | |||
| ~CPUKernel() override = default; | |||
| virtual void Init(const CNodePtr &kernel_node); | |||
| virtual void InitKernel(const CNodePtr &kernel_node) = 0; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override { | |||
| return Launch(inputs, workspace, outputs); | |||
| } | |||
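| // The four-argument KernelMod::Launch above drops the unused stream pointer and forwards to the | |||
| // CPU-specific overload that subclasses implement. | |||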
| virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| protected: | |||
| virtual void InitInputOutputSize(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
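| // A minimal sketch (hypothetical "Square" op, not part of this patch) of how a concrete kernel | |||
| // fills in this interface: | |||
| //   class SquareCPUKernel : public CPUKernel { | |||
| //    public: | |||
| //     void InitKernel(const CNodePtr &node) override { /* cache shapes/attrs from the node */ } | |||
| //     bool Launch(const std::vector<AddressPtr> &in, const std::vector<AddressPtr> &, | |||
| //                 const std::vector<AddressPtr> &out) override { | |||
| //       auto x = reinterpret_cast<float *>(in[0]->addr); | |||
| //       auto y = reinterpret_cast<float *>(out[0]->addr); | |||
| //       for (size_t i = 0; i < in[0]->size / sizeof(float); ++i) y[i] = x[i] * x[i]; | |||
| //       return true; | |||
| //     } | |||
| //   }; | |||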
| class CPUKernelUtils { | |||
| public: | |||
| static void ExpandDimsTo4(std::vector<size_t> *shape); | |||
| static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3); | |||
| static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis); | |||
| static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num); | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,104 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include <memory> | |||
| #include <iostream> | |||
| #include <string> | |||
| #include "runtime/device/kernel_info.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| CPUKernelFactory &CPUKernelFactory::GetInstance() { | |||
| static CPUKernelFactory instance; | |||
| return instance; | |||
| } | |||
| void CPUKernelFactory::Register(const std::string &kernel_name, const KernelAttr &kernel_attr, | |||
| CPUKernelCreator &&kernel_creator) { | |||
| (void)name_to_attr_creator_[kernel_name].emplace_back(kernel_attr, kernel_creator); | |||
| #if !defined(_WIN32) && !defined(_WIN64) | |||
| MS_LOG(DEBUG) << "CPUKernelFactory register operator: " << kernel_name; | |||
| #endif | |||
| } | |||
| std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_name, const CNodePtr &apply_kernel) { | |||
| auto kernel_info = dynamic_cast<device::KernelInfo *>(apply_kernel->kernel_info()); | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| const KernelBuildInfo *kernel_build_info = kernel_info->select_kernel_build_info(); | |||
| MS_EXCEPTION_IF_NULL(kernel_build_info); | |||
| std::pair<bool, size_t> ret_pair = CPUKernelAttrCheck(kernel_name, *kernel_build_info); | |||
| if (ret_pair.first) { | |||
| return (name_to_attr_creator_.find(kernel_name)->second)[ret_pair.second].second(); | |||
| } | |||
| return nullptr; | |||
| } | |||
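| // CPUKernelAttrCheck returns {true, index} where index selects the first registered KernelAttr | |||
| // whose input/output dtypes match the node's selected KernelBuildInfo. | |||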
| std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string &kernel_name, | |||
| const KernelBuildInfo &kernel_info) { | |||
| auto iter = name_to_attr_creator_.find(kernel_name); | |||
| if (iter == name_to_attr_creator_.end()) { | |||
| MS_LOG(INFO) << "Not registered CPU kernel: op[" << kernel_name << "]!"; | |||
| return std::make_pair(false, 0); | |||
| } | |||
| auto creators = iter->second; | |||
| for (size_t index = 0; index < creators.size(); ++index) { | |||
| auto attr_creator = creators[index]; | |||
| if (CPUKernelSingleAttrCheck(attr_creator.first, kernel_info)) { | |||
| return std::make_pair(true, index); | |||
| } | |||
| } | |||
| return std::make_pair(false, 0); | |||
| } | |||
| bool CPUKernelFactory::CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, const KernelBuildInfo &kernel_info) { | |||
| for (size_t i = 0; i < kernel_info.GetInputNum(); ++i) { | |||
| auto dtype = kernel_attr.GetAllSame() ? kernel_attr.GetInputAttr(0).first : kernel_attr.GetInputAttr(i).first; | |||
| if (kernel_info.GetInputDeviceType(i) != dtype) { | |||
| MS_LOG(DEBUG) << "input index:" << i << ", kernel info type:" << kernel_info.GetInputDeviceType(i) | |||
| << ", register type:" << dtype; | |||
| return false; | |||
| } | |||
| } | |||
| for (size_t i = 0; i < kernel_info.GetOutputNum(); ++i) { | |||
| auto dtype = kernel_attr.GetAllSame() ? kernel_attr.GetOutputAttr(0).first : kernel_attr.GetOutputAttr(i).first; | |||
| if (kernel_info.GetOutputDeviceType(i) != dtype) { | |||
| MS_LOG(DEBUG) << "output index:" << i << ", kernel info type:" << kernel_info.GetOutputDeviceType(i) | |||
| << ", register type:" << dtype; | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| std::vector<KernelAttr> CPUKernelFactory::GetSupportedKernelAttrList(const std::string &kernel_name) { | |||
| std::vector<KernelAttr> result; | |||
| auto iter = name_to_attr_creator_.find(kernel_name); | |||
| if (iter == name_to_attr_creator_.end()) { | |||
| MS_LOG(WARNING) << "Not registered CPU kernel: op[" << kernel_name << "]!"; | |||
| return result; | |||
| } | |||
| auto creators = iter->second; | |||
| for (size_t index = 0; index < creators.size(); ++index) { | |||
| auto attr_creator = creators[index]; | |||
| result.push_back(attr_creator.first); | |||
| } | |||
| return result; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,79 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_FACTORY_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_FACTORY_H_ | |||
| #include <functional> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using mindspore::device::cpu::KernelAttr; | |||
| using CPUKernelCreator = std::function<std::shared_ptr<CPUKernel>()>; | |||
| class CPUKernelFactory { | |||
| public: | |||
| static CPUKernelFactory &GetInstance(); | |||
| void Register(const std::string &kernel_name, const KernelAttr &kernel_attr, CPUKernelCreator &&kernel_creator); | |||
| std::shared_ptr<CPUKernel> Create(const std::string &kernel_name, const CNodePtr &apply_kernel); | |||
| std::vector<KernelAttr> GetSupportedKernelAttrList(const std::string &kernel_name); | |||
| private: | |||
| CPUKernelFactory() = default; | |||
| ~CPUKernelFactory() = default; | |||
| DISABLE_COPY_AND_ASSIGN(CPUKernelFactory) | |||
| std::pair<bool, size_t> CPUKernelAttrCheck(const std::string &kernel_name, const KernelBuildInfo &kernel_info); | |||
| bool CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, const KernelBuildInfo &kernel_info); | |||
| std::map<std::string, std::vector<std::pair<KernelAttr, CPUKernelCreator>>> name_to_attr_creator_; | |||
| }; | |||
| class CPUKernelRegistrar { | |||
| public: | |||
| CPUKernelRegistrar(const std::string &kernel_name, const KernelAttr &kernel_attr, CPUKernelCreator &&kernel_creator) { | |||
| CPUKernelFactory::GetInstance().Register(kernel_name, kernel_attr, std::move(kernel_creator)); | |||
| } | |||
| ~CPUKernelRegistrar() = default; | |||
| }; | |||
| #define MS_REG_CPU_KERNEL(OPNAME, ATTR, OPCLASS) MS_REG_CPU_KERNEL_(__COUNTER__, OPNAME, ATTR, OPCLASS) | |||
| #define MS_REG_CPU_KERNEL_(COUNT, OPNAME, ATTR, OPCLASS) _MS_REG_CPU_KERNEL_(COUNT, OPNAME, ATTR, OPCLASS) | |||
| #define _MS_REG_CPU_KERNEL_(COUNT, OPNAME, ATTR, OPCLASS) \ | |||
| static_assert(std::is_base_of<CPUKernel, OPCLASS>::value, " must be base of CPUKernel"); \ | |||
| static const CPUKernelRegistrar g_cpu_kernel_##COUNT##_reg(#OPNAME, ATTR, \ | |||
| []() { return std::make_shared<OPCLASS>(); }); | |||
| #define MS_REG_CPU_KERNEL_T(OPNAME, ATTR, OPCLASS, T) MS_REG_CPU_KERNEL_T_(__COUNTER__, OPNAME, ATTR, OPCLASS, T) | |||
| #define MS_REG_CPU_KERNEL_T_(COUNT, OPNAME, ATTR, OPCLASS, T) _MS_REG_CPU_KERNEL_T_(COUNT, OPNAME, ATTR, OPCLASS, T) | |||
| #define _MS_REG_CPU_KERNEL_T_(COUNT, OPNAME, ATTR, OPCLASS, T) \ | |||
| static_assert(std::is_base_of<CPUKernel, OPCLASS<T>>::value, " must be base of CPUKernel"); \ | |||
| static const CPUKernelRegistrar g_cpu_kernel_##COUNT##_##OPNAME##_##T##_reg( \ | |||
| #OPNAME, ATTR, []() { return std::make_shared<OPCLASS<T>>(); }); | |||
| #define MS_REG_CPU_KERNEL_T_S(OPNAME, ATTR, OPCLASS, T, S) \ | |||
| static_assert(std::is_base_of<CPUKernel, OPCLASS<T, S>>::value, " must be base of CPUKernel"); \ | |||
| static const CPUKernelRegistrar g_cpu_kernel_##OPNAME##_##T##_##S##_reg( \ | |||
| #OPNAME, ATTR, []() { return std::make_shared<OPCLASS<T, S>>(); }); | |||
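| // Hypothetical usage sketch (MyOp/MyOpCPUKernel are illustrative, not real ops): | |||
| //   MS_REG_CPU_KERNEL(MyOp, | |||
| //                     KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| //                     MyOpCPUKernel); | |||
| // The macro expands (via __COUNTER__ for a unique symbol) into a file-scope CPUKernelRegistrar whose | |||
| // constructor registers a creator lambda with CPUKernelFactory before main() runs. | |||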
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CPU_KERNEL_FACTORY_H_ | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| #include "debug/debugger/debugger.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); } | |||
| bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "input or output empty!"; | |||
| } | |||
| auto val = reinterpret_cast<float *>(inputs[0]->addr); | |||
| MS_LOG(DEBUG) << "launch DebugCPUKernel val " << *val; | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| size_t elem_num = inputs[0]->size / sizeof(float); | |||
| for (size_t i = 0; i < elem_num; i++) { | |||
| output[i] = static_cast<int>(val[i]); | |||
| } | |||
| #ifdef ENABLE_DEBUGGER | |||
| // the debugger will suspend execution if necessary | |||
| Debugger::GetInstance()->PostDebugOp(); | |||
| #endif | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class DebugCPUKernel : public CPUKernel { | |||
| public: | |||
| DebugCPUKernel() = default; | |||
| ~DebugCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Debug, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32), DebugCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_DEBUG_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,78 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void EmbeddingLookUpCommGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| split_num_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "split_num"); | |||
| MS_LOG(INFO) << "split_num: " << split_num_; | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape[0] % split_num_ != 0) { | |||
| MS_LOG(EXCEPTION) << "Input shape[0] is " << input_shape[0] << ", but it must be multiple of split_num."; | |||
| } | |||
| } | |||
| bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| #if defined(_WIN32) || defined(_WIN64) | |||
| auto start_time = std::chrono::steady_clock::now(); | |||
| #else | |||
| struct timeval start_time, end_time; | |||
| (void)gettimeofday(&start_time, nullptr); | |||
| #endif | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| size_t input_size = inputs[0]->size; | |||
| size_t output_size = outputs[0]->size; | |||
| MS_LOG(DEBUG) << "input addr: " << input_addr << "input size: " << input_size; | |||
| MS_LOG(DEBUG) << "output addr: " << output_addr << "output size: " << output_size; | |||
| if (memset_s(output_addr, output_size, 0, output_size) != EOK) { | |||
| MS_LOG(EXCEPTION) << "memset_s output buffer failed."; | |||
| } | |||
| const std::vector<int> &rank_group = {0, 1, 2, 3, 4, 5, 6, 7}; | |||
| size_t input_split_lens = input_size / split_num_ / sizeof(float_t); | |||
| size_t output_split_lens = output_size / split_num_ / sizeof(float_t); | |||
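| // Each of the split_num_ input segments is AllGather-ed across the fixed 8-rank group below. | |||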
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| for (int i = 0; i < split_num_; i++) { | |||
| mpi_instance->AllGather(input_addr + i * input_split_lens, output_addr + i * output_split_lens, rank_group, | |||
| input_split_lens); | |||
| } | |||
| #if defined(_WIN32) || defined(_WIN64) | |||
| auto end_time = std::chrono::steady_clock::now(); | |||
| std::chrono::duration<double, std::ratio<1, 1000000>> cost = end_time - start_time; | |||
| MS_LOG(INFO) << "EmbeddingLookUpCommGradCPUKernel, used time: " << cost.count() << " us"; | |||
| #else | |||
| (void)gettimeofday(&end_time, nullptr); | |||
| uint64_t time = 1000000 * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec); | |||
| time += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec); | |||
| MS_LOG(INFO) << "EmbeddingLookUpCommGradCPUKernel, used time: " << time << " us"; | |||
| #endif | |||
| return true; | |||
| } | |||
| void EmbeddingLookUpCommGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCommGradCPUKernel needs 1."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class EmbeddingLookUpCommGradCPUKernel : public CPUKernel { | |||
| public: | |||
| EmbeddingLookUpCommGradCPUKernel() : split_num_(1) {} | |||
| ~EmbeddingLookUpCommGradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int split_num_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(EmbeddingLookupCommGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| EmbeddingLookUpCommGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,212 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| #include "ir/primitive.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void EmbeddingLookUpCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_lens_ = 1; | |||
| for (auto shape : input_shape_) { | |||
| input_lens_ = input_lens_ * shape; | |||
| } | |||
| indices_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| indices_lens_ = 1; | |||
| for (auto shape : indices_shape_) { | |||
| indices_lens_ = indices_lens_ * shape; | |||
| } | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| axis_ = 4 - input_shape_.size(); | |||
| if (AnfAlgo::HasNodeAttr(kAttrReduceScatterFlag, kernel_node)) { | |||
| reduce_scatter_flag_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, kAttrReduceScatterFlag); | |||
| } | |||
| #ifdef ENABLE_MPI | |||
| if (reduce_scatter_flag_) { | |||
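| // The intermediate GatherV2 result replaces dim 0 of the parameter shape with the flattened indices, | |||
| // so its element count is prod(indices_shape_) * prod(input_shape_[1:]). | |||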
| size_t gatherv2_out_lens = 1; | |||
| for (int i = 0; i < SizeToInt(input_shape_.size()); i++) { | |||
| if (i == 0) { | |||
| for (int j = 0; j < SizeToInt(indices_shape_.size()); j++) { | |||
| gatherv2_out_lens = gatherv2_out_lens * indices_shape_[j]; | |||
| } | |||
| } else { | |||
| gatherv2_out_lens = gatherv2_out_lens * input_shape_[i]; | |||
| } | |||
| } | |||
| gatherv2_out_lens_ = gatherv2_out_lens * sizeof(float); | |||
| gather_v2_out_ = malloc(gatherv2_out_lens_); | |||
| if (gather_v2_out_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel malloc failed, malloc lens: " << gatherv2_out_lens_; | |||
| } | |||
| auto ret = memset_s(gather_v2_out_, gatherv2_out_lens_, 0, gatherv2_out_lens_); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel memset gatherv2 output buffer failed"; | |||
| } | |||
| split_num_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "split_num"); | |||
| } | |||
| #else | |||
| if (reduce_scatter_flag_) { | |||
| MS_LOG(EXCEPTION) << "Not Enable MPI, please build version with -M on when set reduce_scatter_flag true"; | |||
| } | |||
| #endif | |||
| if (AnfAlgo::HasNodeAttr(kAttrOffset, kernel_node)) { | |||
| offset_ = AnfAlgo::GetNodeAttr<int>(kernel_node, kAttrOffset); | |||
| } | |||
| CPUKernelUtils::ExpandDimsTo4(&input_shape_); | |||
| CPUKernelUtils::ExpandDimsTo4(&output_shape_); | |||
| } | |||
| bool EmbeddingLookUpCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| float *gather_out_addr = reduce_scatter_flag_ ? reinterpret_cast<float *>(gather_v2_out_) : output_addr; | |||
| size_t dim0 = input_shape_[0]; | |||
| size_t dim1 = input_shape_[1]; | |||
| size_t dim2 = input_shape_[2]; | |||
| if (axis_ == 3) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| for (size_t k = 0; k < dim2; ++k) { | |||
| LookUpTable(inputs, i, j, k, &gather_out_addr); | |||
| } | |||
| } | |||
| } | |||
| } else if (axis_ == 2) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| LookUpTable(inputs, i, j, 0, &gather_out_addr); | |||
| } | |||
| } | |||
| } else if (axis_ == 1) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| LookUpTable(inputs, i, 0, 0, &gather_out_addr); | |||
| } | |||
| } else if (axis_ == 0) { | |||
| LookUpTable(inputs, 0, 0, 0, &gather_out_addr); | |||
| } | |||
| #ifdef ENABLE_MPI | |||
| if (reduce_scatter_flag_) { | |||
| size_t one_split_lens = gatherv2_out_lens_ / split_num_ / sizeof(float); | |||
| size_t reduce_scatter_out_lens = one_split_lens / 8; | |||
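| // ReduceScatter across the 8-rank group leaves each rank with 1/8 of every split, hence the /8. | |||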
| const std::vector<int> &group = {0, 1, 2, 3, 4, 5, 6, 7}; | |||
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| for (int i = 0; i < split_num_; i++) { | |||
| mpi_instance->ReduceScatter(reinterpret_cast<float *>(gather_v2_out_) + i * one_split_lens, | |||
| output_addr + i * reduce_scatter_out_lens, group, one_split_lens / 8, "sum"); | |||
| } | |||
| } | |||
| #endif | |||
| return true; | |||
| } | |||
| void LookUpTable_task(const float *input_addr, float *output_addr, const int *indices_addr, size_t indices_lens, | |||
| size_t num, size_t dim0, size_t dim1, size_t dim2, int offset, size_t axis, | |||
| std::vector<size_t> input_shape, size_t input_lens) { | |||
| size_t lens = num * sizeof(float); | |||
| for (size_t i = 0; i < indices_lens; ++i) { | |||
| // an out-of-range index yields an all-zero output slice instead of reading out of bounds | |||
| bool copied = false; | |||
| int indices = indices_addr[i] - offset; | |||
| if (indices >= 0 && IntToSize(indices) < input_shape[axis]) { | |||
| size_t index = IntToSize(indices); | |||
| size_t pos = 0; | |||
| if (axis == 3) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape, dim0, dim1, dim2, index); | |||
| } else if (axis == 2) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape, dim0, dim1, index, 0); | |||
| } else if (axis == 1) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape, dim0, index, 0, 0); | |||
| } else if (axis == 0) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape, index, 0, 0, 0); | |||
| } | |||
| if (pos + num <= input_lens) { | |||
| if (memcpy_s(output_addr, lens, input_addr + pos, lens) != EOK) { | |||
| MS_LOG(EXCEPTION) << "LookUpTable task memcpy failed."; | |||
| } | |||
| copied = true; | |||
| } | |||
| } | |||
| if (!copied && memset_s(output_addr, lens, 0, lens) != EOK) { | |||
| MS_LOG(EXCEPTION) << "LookUpTable task memset failed."; | |||
| } | |||
| output_addr += num; | |||
| } | |||
| } | |||
| void EmbeddingLookUpCPUKernel::LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, | |||
| size_t dim2, float **output_addr) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr); | |||
| size_t num = CPUKernelUtils::GetElementNumOnAxis(input_shape_, axis_); | |||
| float *task_out_addr = *output_addr; | |||
| const size_t thread_num = 8; | |||
| std::thread threads[8]; | |||
| size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num; | |||
| size_t i; | |||
| size_t task_offset = 0; | |||
| MS_LOG(DEBUG) << "indices_lens_: " << indices_lens_ << " one task proc lens:" << task_proc_lens; | |||
| for (i = 0; i < thread_num; i++) { | |||
| if (task_offset >= indices_lens_) { | |||
| break; | |||
| } | |||
| MS_LOG(DEBUG) << "task_offset: " << task_offset << " task_proc_lenss:" << task_proc_lens; | |||
| threads[i] = | |||
| std::thread(LookUpTable_task, input_addr, task_out_addr + task_offset * num, indices_addr + task_offset, | |||
| task_proc_lens, num, dim0, dim1, dim2, offset_, axis_, input_shape_, input_lens_); | |||
| task_offset += task_proc_lens; | |||
| if (task_offset + task_proc_lens > indices_lens_) { | |||
| task_proc_lens = indices_lens_ - task_offset; | |||
| } | |||
| } | |||
| for (size_t j = 0; j < i; j++) { | |||
| threads[j].join(); | |||
| } | |||
| *output_addr += num * indices_lens_; | |||
| } | |||
| void EmbeddingLookUpCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() | |||
| << ", but EmbeddingLookUpCPUKernel olny support 4d or lower."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCPUKernel needs 2."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,74 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class EmbeddingLookUpCPUKernel : public CPUKernel { | |||
| public: | |||
| EmbeddingLookUpCPUKernel() { | |||
| axis_ = 0; | |||
| offset_ = 0; | |||
| split_num_ = 0; | |||
| input_lens_ = 0; | |||
| indices_lens_ = 0; | |||
| gatherv2_out_lens_ = 0; | |||
| reduce_scatter_flag_ = false; | |||
| gather_v2_out_ = nullptr; | |||
| } | |||
| ~EmbeddingLookUpCPUKernel() override { | |||
| if (gather_v2_out_ != nullptr) { | |||
| free(gather_v2_out_); | |||
| gather_v2_out_ = nullptr; | |||
| } | |||
| } | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2, | |||
| float **output_addr); | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> indices_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| int axis_; | |||
| int offset_; | |||
| int split_num_; | |||
| size_t input_lens_; | |||
| size_t indices_lens_; | |||
| size_t gatherv2_out_lens_; | |||
| bool reduce_scatter_flag_; | |||
| void *gather_v2_out_; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| EmbeddingLookup, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), | |||
| EmbeddingLookUpCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/equal_count_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void EqualCountCPUKernel::InitKernel(const CNodePtr & /*kernel_node*/) {} | |||
| bool EqualCountCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "input or output empty!"; | |||
| } | |||
| if (inputs[0]->size != inputs[1]->size) { | |||
| MS_LOG(EXCEPTION) << "input or output size!"; | |||
| } | |||
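| // Count the positions where the two int32 inputs agree (typically predicted vs. ground-truth labels). | |||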
| int count = 0; | |||
| auto left = reinterpret_cast<int *>(inputs[0]->addr); | |||
| auto right = reinterpret_cast<int *>(inputs[1]->addr); | |||
| size_t elem_num = inputs[0]->size / sizeof(int); | |||
| for (size_t i = 0; i < elem_num; i++) { | |||
| if (left[i] == right[i]) { | |||
| count++; | |||
| } | |||
| } | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| output[0] = count; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_EQUAL_COUNT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_EQUAL_COUNT_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class EqualCountCPUKernel : public CPUKernel { | |||
| public: | |||
| EqualCountCPUKernel() = default; | |||
| ~EqualCountCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| EqualCount, | |||
| KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| EqualCountCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_EQUAL_COUNT_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,115 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/gather_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void GatherV2CPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indices_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| axis_ = AnfAlgo::GetNodeAttr<int>(kernel_node, AXIS); | |||
| if (axis_ < 0) { | |||
| axis_ = axis_ + SizeToInt(input_shape_.size()); | |||
| } | |||
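| // The shapes below are expanded to 4-D by padding leading dims with 1s (assumed ExpandDimsTo4 behavior), so shift axis_ to keep addressing the same dimension. | |||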
| axis_ += 4 - input_shape_.size(); | |||
| CPUKernelUtils::ExpandDimsTo4(&input_shape_); | |||
| CPUKernelUtils::ExpandDimsTo4(&output_shape_); | |||
| } | |||
| bool GatherV2CPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto buff_size = outputs[0]->size; | |||
| size_t dim0 = input_shape_[0]; | |||
| size_t dim1 = input_shape_[1]; | |||
| size_t dim2 = input_shape_[2]; | |||
| if (axis_ == 3) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| for (size_t k = 0; k < dim2; ++k) { | |||
| CopyDataToOutput(inputs, i, j, k, &output_addr, &buff_size); | |||
| } | |||
| } | |||
| } | |||
| } else if (axis_ == 2) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| CopyDataToOutput(inputs, i, j, 0, &output_addr, &buff_size); | |||
| } | |||
| } | |||
| } else if (axis_ == 1) { | |||
| for (size_t i = 0; i < dim0; ++i) { | |||
| CopyDataToOutput(inputs, i, 0, 0, &output_addr, &buff_size); | |||
| } | |||
| } else if (axis_ == 0) { | |||
| CopyDataToOutput(inputs, 0, 0, 0, &output_addr, &buff_size); | |||
| } | |||
| return true; | |||
| } | |||
| void GatherV2CPUKernel::CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, | |||
| size_t dim2, float **output_addr, size_t *buff_size) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr); | |||
| size_t elem_num = inputs[1]->size / sizeof(int); | |||
| size_t num = CPUKernelUtils::GetElementNumOnAxis(input_shape_, axis_); | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| if (indices_addr[i] < 0) { | |||
| MS_LOG(EXCEPTION) << "The indices value is less than 0."; | |||
| } | |||
| size_t index = IntToSize(indices_addr[i]); | |||
| if (index >= input_shape_[IntToSize(axis_)]) { | |||
| auto ret = memset_s(*output_addr, *buff_size, 0, num * sizeof(float)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "memset failed."; | |||
| } | |||
| } else { | |||
| size_t pos = 0; | |||
| if (axis_ == 3) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, dim1, dim2, index); | |||
| } else if (axis_ == 2) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, dim1, index, 0); | |||
| } else if (axis_ == 1) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, index, 0, 0); | |||
| } else if (axis_ == 0) { | |||
| pos = CPUKernelUtils::CalcOffset(input_shape_, index, 0, 0, 0); | |||
| } | |||
| auto ret = memcpy_s(*output_addr, *buff_size, input_addr + pos, num * sizeof(float)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "memcpy failed."; | |||
| } | |||
| } | |||
| *output_addr += num; | |||
| *buff_size -= num * sizeof(float); | |||
| } | |||
| } | |||
| void GatherV2CPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", but GatherV2CPUKernel olny support 4d or lower."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherV2CPUKernel needs 2."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
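| Aside: CPUKernelUtils::CalcOffset is defined elsewhere in the tree; a plausible reading of it, consistent with how this kernel indexes its 4-D-expanded input, is the usual row-major flattening (treat this sketch as an assumption, not the helper's actual source): | |||
| #include <cstddef> | |||
| #include <vector> | |||
| // Assumed row-major flat offset of element (d0, d1, d2, d3) in a 4-D shape. | |||
| size_t CalcOffset4D(const std::vector<size_t> &shape, size_t d0, size_t d1, size_t d2, size_t d3) { | |||
| return ((d0 * shape[1] + d1) * shape[2] + d2) * shape[3] + d3; | |||
| } | |||
| // e.g. for shape {2, 3, 4, 5}: CalcOffset4D(shape, 1, 2, 0, 0) == 100. | |||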
| @@ -0,0 +1,52 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_GATHER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_GATHER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class GatherV2CPUKernel : public CPUKernel { | |||
| public: | |||
| GatherV2CPUKernel() : axis_(0) {} | |||
| ~GatherV2CPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2, | |||
| float **output_addr, size_t *buff_size); | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> indices_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| int axis_; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| GatherV2, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), | |||
| GatherV2CPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_GATHER_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,91 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv2d_cpu_kernel.h" | |||
| #include <string> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void Conv2dCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 || weight_shape.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "conv2d only support nchw input!"; | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::memory::desc weights_desc = GetDefaultMemDesc(weight_shape); | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| int kernel_size = SizeToInt(weight_shape[3]); | |||
| auto stride_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, STRIDE); | |||
| auto dilation_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, DILATION); | |||
| if (stride_ori.size() != 4 || stride_ori[2] != stride_ori[3]) { | |||
| MS_LOG(EXCEPTION) << "conv2d requires equal strides on H and W, and the stride attribute must be 4-D!"; | |||
| } | |||
| if (stride_ori[0] != 1 || stride_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "conv2d stride must be 1 on the N and C axes!"; | |||
| } | |||
| if (dilation_ori.size() != 4 || dilation_ori[2] != 1 || dilation_ori[3] != 1) { | |||
| MS_LOG(EXCEPTION) << "conv2d only supports dilation 1 on H and W, and the dilation attribute must be 4-D!"; | |||
| } | |||
| if (dilation_ori[0] != 1 || dilation_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "conv2d dilation must be 1 on the N and C axes!"; | |||
| } | |||
| int stride = stride_ori[2]; | |||
| int dilation = dilation_ori[2]; | |||
| dnnl::memory::dims strides{stride, stride}; | |||
| dnnl::memory::dims dilates{dilation - 1, dilation - 1}; | |||
| std::vector<int> int_padding_l; | |||
| std::vector<int> int_padding_r; | |||
| const std::string pad_mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, PAD_MODE); | |||
| GetPadding(kernel_node, pad_mode, src_shape, kernel_size, stride, &int_padding_l, &int_padding_r); | |||
| if (int_padding_l.size() != 2 || int_padding_r.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "get padding failed"; | |||
| } | |||
| dnnl::memory::dims padding_l{int_padding_l[0], int_padding_l[1]}; | |||
| dnnl::memory::dims padding_r{int_padding_r[0], int_padding_r[1]}; | |||
| dnnl::convolution_forward::desc desc = | |||
| dnnl::convolution_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::convolution_auto, src_desc, | |||
| weights_desc, dst_desc, strides, dilates, padding_l, padding_r); | |||
| auto prim_desc = dnnl::convolution_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::convolution_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_WEIGHTS, weights_desc); | |||
| AddArgument(DNNL_ARG_DST, dst_desc); | |||
| } | |||
| bool Conv2dCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
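| Aside: the dilates{dilation - 1, dilation - 1} translation above reflects a convention difference: oneDNN counts dilation as the number of zeros inserted between kernel taps (0 means dense), while the framework attribute uses the usual scale factor (1 means dense). A one-line statement of the mapping, with names of our own choosing: | |||
| // Framework dilation attribute -> oneDNN dilate value (hypothetical helper). | |||
| inline int ToDnnlDilate(int dilation) { return dilation - 1; } | |||
| // ToDnnlDilate(1) == 0: dense kernel; ToDnnlDilate(2) == 1: one zero inserted. | |||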
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class Conv2dCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| Conv2dCPUKernel() = default; | |||
| ~Conv2dCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| Conv2D, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| Conv2dCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,93 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv2d_grad_filter_cpu_kernel.h" | |||
| #include <string> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void Conv2dGradFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 || weight_shape.size() != 4) { | |||
| MS_LOG(EXCEPTION) << ("conv2d grad filter only support nchw input!"); | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::memory::desc weights_desc = GetDefaultMemDesc(weight_shape); | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| int kernel_size = SizeToInt(weight_shape[3]); | |||
| auto stride_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, STRIDE); | |||
| auto dilation_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, DILATION); | |||
| if (stride_ori.size() != 2 || stride_ori[0] != stride_ori[1]) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel requires equal strides, and the stride attribute must be 2-D!"; | |||
| } | |||
| if (dilation_ori.size() != 4 || dilation_ori[2] != 1 || dilation_ori[3] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel only supports dilation 1 on H and W, and the dilation attribute must be 4-D!"; | |||
| } | |||
| if (dilation_ori[0] != 1 || dilation_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel dilation must be 1 on the N and C axes!"; | |||
| } | |||
| int stride = stride_ori[0]; | |||
| int dilation = dilation_ori[2]; | |||
| dnnl::memory::dims strides{stride, stride}; | |||
| dnnl::memory::dims dilates{dilation - 1, dilation - 1}; | |||
| const std::string pad_mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, PAD_MODE); | |||
| std::vector<int> int_padding_l; | |||
| std::vector<int> int_padding_r; | |||
| GetPadding(kernel_node, pad_mode, src_shape, kernel_size, stride, &int_padding_l, &int_padding_r); | |||
| if (int_padding_l.size() != 2 || int_padding_r.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "get padding failed"; | |||
| } | |||
| dnnl::memory::dims padding_l{int_padding_l[0], int_padding_l[1]}; | |||
| dnnl::memory::dims padding_r{int_padding_r[0], int_padding_r[1]}; | |||
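| // oneDNN builds backward-weights primitives against a forward primitive_desc | |||
| // hint, so a forward convolution descriptor is constructed first. | |||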
| dnnl::convolution_forward::desc forward_desc = | |||
| dnnl::convolution_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::convolution_auto, src_desc, | |||
| weights_desc, dst_desc, strides, dilates, padding_l, padding_r); | |||
| auto forward_prim_desc = dnnl::convolution_forward::primitive_desc(forward_desc, MKLKernelEngine::Get().engine()); | |||
| dnnl::convolution_backward_weights::desc backward_desc = dnnl::convolution_backward_weights::desc( | |||
| dnnl::algorithm::convolution_auto, src_desc, weights_desc, dst_desc, strides, dilates, padding_l, padding_r); | |||
| auto backward_prim_desc = dnnl::convolution_backward_weights::primitive_desc( | |||
| backward_desc, MKLKernelEngine::Get().engine(), forward_prim_desc); | |||
| primitive_ = std::make_shared<dnnl::convolution_backward_weights>(backward_prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST, dst_desc); | |||
| AddArgument(DNNL_ARG_DIFF_WEIGHTS, weights_desc); | |||
| } | |||
| bool Conv2dGradFilterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_FILTER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_FILTER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class Conv2dGradFilterCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| Conv2dGradFilterCPUKernel() = default; | |||
| ~Conv2dGradFilterCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| Conv2DBackpropFilter, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| Conv2dGradFilterCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_FILTER_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,92 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv2d_grad_input_cpu_kernel.h" | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 || weight_shape.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "conv2d grad filter only support nchw input!"; | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::memory::desc weights_desc = GetDefaultMemDesc(weight_shape); | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| int kernel_size = SizeToInt(weight_shape[3]); | |||
| auto stride_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, STRIDE); | |||
| auto dilation_ori = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, DILATION); | |||
| if (stride_ori.size() != 2 || stride_ori[0] != stride_ori[1]) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel requires equal strides, and the stride attribute must be 2-D!"; | |||
| } | |||
| if (dilation_ori.size() != 4 || dilation_ori[2] != 1 || dilation_ori[3] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel only supports dilation 1 on H and W, and the dilation attribute must be 4-D!"; | |||
| } | |||
| if (dilation_ori[0] != 1 || dilation_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel dilation must be 1 on the N and C axes!"; | |||
| } | |||
| int stride = stride_ori[0]; | |||
| int dilation = dilation_ori[2]; | |||
| dnnl::memory::dims strides{stride, stride}; | |||
| dnnl::memory::dims dilates{dilation - 1, dilation - 1}; | |||
| std::vector<int> int_padding_l; | |||
| std::vector<int> int_padding_r; | |||
| const std::string pad_mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, PAD_MODE); | |||
| GetPadding(kernel_node, pad_mode, src_shape, kernel_size, stride, &int_padding_l, &int_padding_r); | |||
| if (int_padding_l.size() != 2 || int_padding_r.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "conv2d grad get padding failed"; | |||
| } | |||
| dnnl::memory::dims padding_l{int_padding_l[0], int_padding_l[1]}; | |||
| dnnl::memory::dims padding_r{int_padding_r[0], int_padding_r[1]}; | |||
| dnnl::convolution_forward::desc forward_desc = | |||
| dnnl::convolution_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::convolution_auto, src_desc, | |||
| weights_desc, dst_desc, strides, dilates, padding_l, padding_r); | |||
| auto forward_prim_desc = dnnl::convolution_forward::primitive_desc(forward_desc, MKLKernelEngine::Get().engine()); | |||
| dnnl::convolution_backward_data::desc backward_desc = dnnl::convolution_backward_data::desc( | |||
| dnnl::algorithm::convolution_auto, src_desc, weights_desc, dst_desc, strides, dilates, padding_l, padding_r); | |||
| auto backward_prim_desc = | |||
| dnnl::convolution_backward_data::primitive_desc(backward_desc, MKLKernelEngine::Get().engine(), forward_prim_desc); | |||
| primitive_ = std::make_shared<dnnl::convolution_backward_data>(backward_prim_desc); | |||
| AddArgument(DNNL_ARG_DIFF_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST, dst_desc); | |||
| AddArgument(DNNL_ARG_WEIGHTS, weights_desc); | |||
| } | |||
| bool Conv2dGradInputCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_INPUT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_INPUT_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class Conv2dGradInputCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| Conv2dGradInputCPUKernel() = default; | |||
| ~Conv2dGradInputCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| Conv2DBackpropInput, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| Conv2dGradInputCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_CONV2D_GRAD_INPUT_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,141 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| #ifdef PLATFORM_86 | |||
| _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); | |||
| _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); | |||
| #endif | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "error iteration shape!"; | |||
| } | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "layers must be greater than zero!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "conv2d only support 3-D input!"; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| dnnl::stream s(eng); | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| if (bidirectional_) { | |||
| direction = dnnl::rnn_direction::bidirectional_concat; | |||
| } | |||
| dim src_dims = {seq_len_, batch_size_, input_size_}; | |||
| dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; | |||
| weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; | |||
| bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; | |||
| dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_}; | |||
| dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); | |||
| dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); | |||
| dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); | |||
| dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); | |||
| dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); | |||
| dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); | |||
| dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); | |||
| auto desc = std::make_shared<dnnl::lstm_forward::desc>(dnnl::prop_kind::forward_training, direction, src_desc, | |||
| src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), | |||
| formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, | |||
| dst_h_desc, dst_c_desc); | |||
| prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng); | |||
| primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_); | |||
| AddArgument(DNNL_ARG_SRC_LAYER, src_desc); | |||
| AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); | |||
| AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); | |||
| AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc()); | |||
| AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc()); | |||
| AddArgument(DNNL_ARG_BIAS, bias_desc); | |||
| AddArgument(DNNL_ARG_DST_LAYER, dst_desc); | |||
| AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); | |||
| AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); | |||
| AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc()); | |||
| } | |||
| bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| using dt = dnnl::memory::data_type; | |||
| using tag = dnnl::memory::format_tag; | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
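| // The packed user weights in inputs[3] are laid out as ldgoi; reorder them | |||
| // into the layout the primitive chose for its tag::any weight descriptors. | |||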
| auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng); | |||
| auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng); | |||
| user_weights_memory.set_data_handle(inputs[3]->addr); | |||
| user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_); | |||
| Reorder(&user_weights_memory, &weights_memory); | |||
| Reorder(&user_weights_h_memory, &weights_h_memory); | |||
| auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng); | |||
| if (has_bias_) { | |||
| bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_); | |||
| } else { | |||
| auto ret = | |||
| memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0, prim_desc_.bias_desc().get_size()); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "bias memset error"; | |||
| } | |||
| } | |||
| // set handle | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
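| Aside: to make the packed-weights arithmetic in InitKernel concrete, here is the same computation evaluated for a hypothetical configuration (num_layers = 2, bidirectional, input_size = 8, hidden_size = 16); the constants are ours, the formulas are the loop's: | |||
| constexpr int input_size = 8, hidden_size = 16, num_layers = 2, num_directions = 2; | |||
| constexpr int gate_size = 4 * hidden_size;  // 64 | |||
| constexpr int w_layer0 = gate_size * input_size;  // 512 (the first layer reads the input) | |||
| constexpr int w_layer1 = gate_size * hidden_size * num_directions;  // 2048 (later layers read both directions) | |||
| constexpr int weight_size = (w_layer0 + w_layer1) * num_directions;  // 5120 floats | |||
| constexpr int weight_h_size = gate_size * hidden_size * num_layers * num_directions;  // 4096 floats | |||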
| @@ -0,0 +1,70 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class LstmCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| LstmCPUKernel() = default; | |||
| ~LstmCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| int weight_size_ = 0; | |||
| int weight_h_size_ = 0; | |||
| int input_size_; | |||
| int hidden_size_; | |||
| int num_layers_; | |||
| int batch_size_; | |||
| int seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| dnnl::lstm_forward::primitive_desc prim_desc_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(LSTM, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| LstmCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,196 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <cmath> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "error iteration shape!"; | |||
| } | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "layers must be greater than zero!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "conv2d only support 3-D input!"; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| if (bidirectional_) { | |||
| direction = dnnl::rnn_direction::bidirectional_concat; | |||
| } | |||
| dim src_dims = {seq_len_, batch_size_, input_size_}; | |||
| dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; | |||
| weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; | |||
| bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; | |||
| dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_}; | |||
| dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; | |||
| dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); | |||
| dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); | |||
| dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); | |||
| dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); | |||
| dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); | |||
| dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); | |||
| dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); | |||
| auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>( | |||
| dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, | |||
| formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, | |||
| dst_c_desc); | |||
| auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng); | |||
| auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>( | |||
| dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), | |||
| formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, | |||
| src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, | |||
| dst_h_desc, dst_c_desc); | |||
| prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc); | |||
| primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_); | |||
| AddArgument(DNNL_ARG_SRC_LAYER, src_desc); | |||
| AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); | |||
| AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); | |||
| AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc()); | |||
| AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc()); | |||
| AddArgument(DNNL_ARG_BIAS, bias_desc); | |||
| AddArgument(DNNL_ARG_DST_LAYER, dst_desc); | |||
| AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); | |||
| AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); | |||
| AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc()); | |||
| AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc); | |||
| AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc); | |||
| AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc); | |||
| AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc()); | |||
| AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc()); | |||
| AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc); | |||
| } | |||
| bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| using dt = dnnl::memory::data_type; | |||
| using tag = dnnl::memory::format_tag; | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
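| // Input layout, mirrored by the SetArgumentHandle calls below: | |||
| // inputs[0..2] = x, hx, cx; inputs[3] = packed weights (+ bias); | |||
| // inputs[4..6] = y, hy, cy; inputs[7..9] = dy, dhy, dcy; inputs[10] = workspace. | |||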
| // construct fw memory | |||
| auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng); | |||
| auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng); | |||
| auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng); | |||
| user_weights_memory.set_data_handle(inputs[3]->addr); | |||
| user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_); | |||
| Reorder(&user_weights_memory, &weights_memory); | |||
| Reorder(&user_weights_h_memory, &weights_h_memory); | |||
| if (has_bias_) { | |||
| bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_); | |||
| } else { | |||
| if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0, | |||
| prim_backward_desc_.bias_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "bias memset error"; | |||
| } | |||
| } | |||
| // construct bw memory | |||
| auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng); | |||
| auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng); | |||
| auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng); | |||
| auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| user_diff_weights_memory.set_data_handle(outputs[3]->addr); | |||
| user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_); | |||
| if (memset_s(user_diff_weights_memory.get_data_handle(), user_diff_weights_memory.get_desc().get_size(), 0, | |||
| user_diff_weights_memory.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "user weights grad memset error"; | |||
| } | |||
| if (memset_s(user_diff_weights_h_memory.get_data_handle(), user_diff_weights_h_memory.get_desc().get_size(), 0, | |||
| user_diff_weights_h_memory.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "user weights iter grad memset error"; | |||
| } | |||
| if (has_bias_) { | |||
| diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_); | |||
| } | |||
| if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0, | |||
| prim_backward_desc_.diff_bias_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "bias grad memset error"; | |||
| } | |||
| if (memset_s(diff_weights_memory.get_data_handle(), diff_weights_memory.get_desc().get_size(), 0, | |||
| diff_weights_memory.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "weights grad memset error"; | |||
| } | |||
| if (memset_s(diff_weights_h_memory.get_data_handle(), diff_weights_h_memory.get_desc().get_size(), 0, | |||
| diff_weights_h_memory.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << "weights iter grad memset error"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| ExecutePrimitive(); | |||
| Reorder(&diff_weights_memory, &user_diff_weights_memory); | |||
| Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,71 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class LSTMGradCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| LSTMGradCPUKernel() = default; | |||
| ~LSTMGradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| int weight_size_ = 0; | |||
| int weight_h_size_ = 0; | |||
| int input_size_; | |||
| int hidden_size_; | |||
| int num_layers_; | |||
| int batch_size_; | |||
| int seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| dnnl::lstm_backward::primitive_desc prim_backward_desc_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(LSTMGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| LSTMGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,71 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "common/utils.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 2 || weight_shape.size() != 2 || dst_shape.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "matmul invalid input size"; | |||
| } | |||
| bool trans_a = AnfAlgo::GetNodeAttr<bool>(kernel_node, TRANSPOSE_A); | |||
| bool trans_b = AnfAlgo::GetNodeAttr<bool>(kernel_node, TRANSPOSE_B); | |||
| if (trans_a) { | |||
| trans_a_ = TRANSPOSE_YES; | |||
| dim_m_ = static_cast<dnnl_dim_t>(src_shape[1]); | |||
| dim_k_ = static_cast<dnnl_dim_t>(src_shape[0]); | |||
| } else { | |||
| dim_m_ = static_cast<dnnl_dim_t>(src_shape[0]); | |||
| dim_k_ = static_cast<dnnl_dim_t>(src_shape[1]); | |||
| } | |||
| if (trans_b) { | |||
| trans_b_ = TRANSPOSE_YES; | |||
| } | |||
| dim_n_ = static_cast<dnnl_dim_t>(dst_shape[1]); | |||
| } | |||
| bool MatMulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "matmul error input output size!"; | |||
| } | |||
| dnnl_dim_t lda = dim_m_; | |||
| if (trans_a_ == TRANSPOSE_NO) { | |||
| lda = dim_k_; | |||
| } | |||
| dnnl_dim_t ldb = dim_k_; | |||
| if (trans_b_ == TRANSPOSE_NO) { | |||
| ldb = dim_n_; | |||
| } | |||
| auto input_a = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto input_b = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output = reinterpret_cast<float *>(outputs[0]->addr); | |||
| (void)dnnl_sgemm(trans_a_, trans_b_, dim_m_, dim_n_, dim_k_, 1.f, input_a, lda, input_b, ldb, 0.f, output, dim_n_); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
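| Aside: the leading-dimension logic above follows the row-major sgemm convention: lda is the stride between consecutive stored rows of A, so it is K when A is untransposed and M when it is transposed (likewise ldb is N or K). A self-contained call with hypothetical values: | |||
| #include <dnnl.h> | |||
| // 2x3 times 3x2 row-major product, no transposition: | |||
| // lda = K = 3, ldb = N = 2, ldc = N = 2. | |||
| float a[6] = {1, 2, 3, 4, 5, 6}; | |||
| float b[6] = {7, 8, 9, 10, 11, 12}; | |||
| float c[4] = {0}; | |||
| dnnl_sgemm('N', 'N', /*M=*/2, /*N=*/2, /*K=*/3, 1.f, a, /*lda=*/3, b, /*ldb=*/2, 0.f, c, /*ldc=*/2); | |||
| // c == {58, 64, 139, 154} | |||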
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_MATMUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_MATMUL_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class MatMulCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| MatMulCPUKernel() = default; | |||
| ~MatMulCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| char trans_a_{TRANSPOSE_NO}; | |||
| char trans_b_{TRANSPOSE_NO}; | |||
| dnnl_dim_t dim_m_{0}; | |||
| dnnl_dim_t dim_n_{0}; | |||
| dnnl_dim_t dim_k_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| MatMul, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| MatMulCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_MATMUL_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,106 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, | |||
| const std::vector<size_t> &src_shape, int kernel_size, int stride, | |||
| std::vector<int> *padding_l, std::vector<int> *padding_r) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| if (src_shape.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "set pad only support src dim >= 2!"; | |||
| } | |||
| // the trailing two axes of src_shape are the spatial extents (height, width) | |||
| std::vector<int> spatial_dims; | |||
| spatial_dims.emplace_back(SizeToInt(src_shape[src_shape.size() - 2])); | |||
| spatial_dims.emplace_back(SizeToInt(src_shape[src_shape.size() - 1])); | |||
| int rad = kernel_size / 2; | |||
| int need_pad = kernel_size - 1; | |||
| MS_LOG(INFO) << "pad mode " << pad_mode; | |||
| if (pad_mode == PAD_MODE_LOWER_SAME || pad_mode == PAD_MODE_UPPER_SAME) { | |||
| // SAME: distribute the (kernel_size - 1) cells of padding across both | |||
| // sides, less the remainder the stride already covers | |||
| for (auto wh : spatial_dims) { | |||
| int re = (wh - 1) % stride; | |||
| int pad = std::max(rad - (re / 2), 0); | |||
| padding_r->emplace_back(pad); | |||
| pad = std::max(need_pad - pad - re, 0); | |||
| padding_l->emplace_back(pad); | |||
| } | |||
| } else if (pad_mode == PAD_MODE_LOWER_VALID || pad_mode == PAD_MODE_UPPER_VALID) { | |||
| MS_LOG(INFO) << "pad valid"; | |||
| padding_l->emplace_back(0); | |||
| padding_l->emplace_back(0); | |||
| padding_r->emplace_back(0); | |||
| padding_r->emplace_back(0); | |||
| } else { | |||
| std::vector<int> pad = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, PAD); | |||
| if (pad.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "wrong pad size in max pooling " << pad.size(); | |||
| } | |||
| padding_l->emplace_back(pad[0]); | |||
| padding_l->emplace_back(pad[1]); | |||
| padding_r->emplace_back(pad[2]); | |||
| padding_r->emplace_back(pad[3]); | |||
| } | |||
| } | |||
| dnnl::memory::format_tag MKLCPUKernel::GetDefaultFormatTag(const dnnl::memory::dims &dims) const { | |||
| dnnl::memory::format_tag mem_tag; | |||
| auto dim_size = dims.size(); | |||
| if (dim_size == 4) { | |||
| mem_tag = dnnl::memory::format_tag::abcd; | |||
| } else if (dim_size == 3) { | |||
| mem_tag = dnnl::memory::format_tag::abc; | |||
| } else if (dim_size == 2) { | |||
| mem_tag = dnnl::memory::format_tag::ab; | |||
| } else if (dim_size == 1) { | |||
| mem_tag = dnnl::memory::format_tag::a; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "kernel dims invalid " << dim_size; | |||
| } | |||
| return mem_tag; | |||
| } | |||
| dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) { | |||
| dnnl::memory::dims dims; | |||
| dims.insert(dims.end(), shape.begin(), shape.end()); | |||
| dnnl::memory::format_tag mem_tag = GetDefaultFormatTag(dims); | |||
| dnnl::memory::desc mem_desc(dims, dnnl::memory::data_type::f32, mem_tag); | |||
| return mem_desc; | |||
| } | |||
| void MKLCPUKernel::AddArgument(int arg_key, const dnnl::memory::desc &mem_desc, bool alloc) { | |||
| arguments_[arg_key] = MKLKernelEngine::Get().CreateMemory(mem_desc, alloc); | |||
| } | |||
| void MKLCPUKernel::SetArgumentHandle(int arg_key, void *ptr) { | |||
| auto arg_iter = arguments_.find(arg_key); | |||
| if (arg_iter != arguments_.end()) { | |||
| arg_iter->second.set_data_handle(ptr); | |||
| } | |||
| } | |||
| void MKLCPUKernel::ExecutePrimitive() { MKLKernelEngine::Get().Execute(primitive_, arguments_); } | |||
| void MKLCPUKernel::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { | |||
| MKLKernelEngine::Get().Reorder(src_mem, dst_mem); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
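| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): the | |||
| // SAME-padding arithmetic from GetPadding() above, extracted into a free | |||
| // function with a worked case. The function name and numbers are invented. | |||
| // -------------------------------------------------------------------- | |||
| #include <algorithm> | |||
| #include <cstdio> | |||
| void same_pad(int wh, int kernel_size, int stride, int *pad_l, int *pad_r) { | |||
| int rad = kernel_size / 2; | |||
| int need_pad = kernel_size - 1; | |||
| int re = (wh - 1) % stride; | |||
| *pad_r = std::max(rad - re / 2, 0); | |||
| *pad_l = std::max(need_pad - *pad_r - re, 0); | |||
| } | |||
| int main() { | |||
| int l = 0, r = 0; | |||
| same_pad(5, 3, 2, &l, &r);  // extent 5, 3-wide kernel, stride 2 | |||
| std::printf("pad_l=%d pad_r=%d\n", l, r);  // prints: pad_l=1 pad_r=1 | |||
| return 0; | |||
| } | |||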
| @@ -0,0 +1,52 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_MKL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_MKL_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "dnnl.hpp" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class MKLCPUKernel : public CPUKernel { | |||
| public: | |||
| MKLCPUKernel() = default; | |||
| ~MKLCPUKernel() override = default; | |||
| protected: | |||
| void GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, const std::vector<size_t> &src_shape, | |||
| int kernel_size, int stride, std::vector<int> *padding_l, std::vector<int> *padding_r); | |||
| void AddArgument(int arg_key, const dnnl::memory::desc &mem_desc, bool alloc = false); | |||
| void SetArgumentHandle(int arg_key, void *ptr); | |||
| dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const; | |||
| dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape); | |||
| void ExecutePrimitive(); | |||
| std::unordered_map<int, dnnl::memory> arguments_; | |||
| std::shared_ptr<dnnl::primitive> primitive_{nullptr}; | |||
| inline dnnl::memory::desc formatted_md(const dnnl::memory::dims &dimensions, dnnl::memory::format_tag layout) { | |||
| return dnnl::memory::desc{{dimensions}, dnnl::memory::data_type::f32, layout}; | |||
| } | |||
| void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem); | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_MKL_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "dnnl.hpp" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MKLKernelEngine::Execute(const std::shared_ptr<dnnl::primitive> &primitive, | |||
| const std::unordered_map<int, dnnl::memory> &arguments) { | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| primitive->execute(stream_, arguments); | |||
| (void)stream_.wait(); | |||
| } | |||
| dnnl::memory MKLKernelEngine::CreateMemory(const dnnl::memory::desc &mem_desc, bool alloc) { | |||
| if (alloc) { | |||
| // dnnl allocates and owns the buffer | |||
| return dnnl::memory(mem_desc, engine_); | |||
| } else { | |||
| // no buffer yet: the data handle is bound later via set_data_handle | |||
| return dnnl::memory(mem_desc, engine_, nullptr); | |||
| } | |||
| } | |||
| void MKLKernelEngine::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { | |||
| dnnl::reorder(*src_mem, *dst_mem).execute(stream_, *src_mem, *dst_mem); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
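| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): the raw | |||
| // DNNL calls MKLKernelEngine wraps -- create a CPU engine and stream, wrap | |||
| // user buffers in dnnl::memory, and run a reorder between two layouts. | |||
| // Standalone demo assuming oneDNN/DNNL 1.x; the buffers are invented. | |||
| // -------------------------------------------------------------------- | |||
| #include <cstdio> | |||
| #include "dnnl.hpp" | |||
| int main() { | |||
| dnnl::engine engine(dnnl::engine::kind::cpu, 0); | |||
| dnnl::stream stream(engine); | |||
| float src_data[4] = {1, 2, 3, 4}; | |||
| float dst_data[4] = {0}; | |||
| // "ab" is plain row-major, "ba" the transposed layout of a 2x2 block | |||
| dnnl::memory::desc src_desc({2, 2}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab); | |||
| dnnl::memory::desc dst_desc({2, 2}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ba); | |||
| dnnl::memory src(src_desc, engine, src_data);  // non-owning, like CreateMemory(..., false) | |||
| dnnl::memory dst(dst_desc, engine, dst_data); | |||
| dnnl::reorder(src, dst).execute(stream, src, dst); | |||
| stream.wait(); | |||
| std::printf("%g %g %g %g\n", dst_data[0], dst_data[1], dst_data[2], dst_data[3]);  // prints: 1 3 2 4 | |||
| return 0; | |||
| } | |||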
| @@ -0,0 +1,61 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src0_shape.size() != src1_shape.size() && src1_shape.size() > 1) { | |||
| MS_LOG(EXCEPTION) << "mul only support same dim input or tensor * scalar " << src0_shape.size() << " vs " | |||
| << src1_shape.size(); | |||
| } | |||
| if (src1_shape.size() < src0_shape.size()) { | |||
| // align a lower-rank second input by appending trailing axes of extent 1, | |||
| // which dnnl::binary broadcasts over | |||
| for (size_t i = src1_shape.size(); i < src0_shape.size(); ++i) { | |||
| src1_shape.emplace_back(1); | |||
| } | |||
| } | |||
| dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape); | |||
| dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape); | |||
| dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape); | |||
| dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_mem_desc, src1_mem_desc, dst_mem_desc); | |||
| auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::binary>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC_0, src0_mem_desc); | |||
| AddArgument(DNNL_ARG_SRC_1, src1_mem_desc); | |||
| AddArgument(DNNL_ARG_DST, dst_mem_desc); | |||
| } | |||
| bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "mul error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
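| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): the shape | |||
| // alignment InitKernel() performs before building the dnnl::binary | |||
| // descriptor. Missing axes are appended on the right with extent 1 (note: | |||
| // NumPy-style broadcasting would instead prepend on the left), and dnnl | |||
| // broadcasts over them. Shapes here are invented. | |||
| // -------------------------------------------------------------------- | |||
| #include <cstdio> | |||
| #include <vector> | |||
| int main() { | |||
| std::vector<size_t> src0_shape{8, 4}; | |||
| std::vector<size_t> src1_shape{8};  // rank-1 second input | |||
| while (src1_shape.size() < src0_shape.size()) { | |||
| src1_shape.emplace_back(1); | |||
| } | |||
| // src1 is now {8, 1}: one value per row, broadcast across the 4 columns | |||
| std::printf("%zu x %zu\n", src1_shape[0], src1_shape[1]);  // prints: 8 x 1 | |||
| return 0; | |||
| } | |||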
| @@ -0,0 +1,42 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_MUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_MUL_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class MulCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| MulCPUKernel() = default; | |||
| ~MulCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| Mul, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| MulCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_MUL_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,69 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/pooling_cpu_kernel.h" | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void PoolingCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| std::vector<int> kernel_sizes = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, KSIZE); | |||
| std::vector<int> strides = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, STRIDES); | |||
| if (kernel_sizes.size() != 4 || strides.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "invalid kernel size " << kernel_sizes.size() << " or stride size " << strides.size(); | |||
| } | |||
| // attributes are NCHW-ordered: index 2 is height, index 3 is width | |||
| dnnl::memory::dims strides_dims{strides[2], strides[3]}; | |||
| dnnl::memory::dims kernels_dims{kernel_sizes[2], kernel_sizes[3]}; | |||
| const std::string pad_mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, PADDING); | |||
| std::vector<int> int_padding_l; | |||
| std::vector<int> int_padding_r; | |||
| GetPadding(kernel_node, pad_mode, src_shape, kernel_sizes[3], strides[3], &int_padding_l, &int_padding_r); | |||
| if (int_padding_l.size() != 2 || int_padding_r.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "pooling get padding failed"; | |||
| } | |||
| dnnl::memory::dims padding_l{int_padding_l[0], int_padding_l[1]}; | |||
| dnnl::memory::dims padding_r{int_padding_r[0], int_padding_r[1]}; | |||
| dnnl::pooling_forward::desc desc = | |||
| dnnl::pooling_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::pooling_max, src_desc, dst_desc, | |||
| strides_dims, kernels_dims, padding_l, padding_r); | |||
| auto prim_desc = dnnl::pooling_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::pooling_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DST, dst_desc); | |||
| AddArgument(DNNL_ARG_WORKSPACE, prim_desc.workspace_desc()); | |||
| } | |||
| bool PoolingCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
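| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): with the | |||
| // SAME padding produced by GetPadding(), the pooled output extent works out | |||
| // to ceil(input / stride); with VALID it is floor((input - kernel) / stride) + 1. | |||
| // A quick check with invented numbers. | |||
| // -------------------------------------------------------------------- | |||
| #include <cstdio> | |||
| int main() { | |||
| int in = 7, k = 3, s = 2; | |||
| int same = (in + s - 1) / s;  // ceil(7 / 2) == 4 | |||
| int valid = (in - k) / s + 1;  // floor(4 / 2) + 1 == 3 | |||
| std::printf("same=%d valid=%d\n", same, valid); | |||
| return 0; | |||
| } | |||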
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_POOLING_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_POOLING_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class PoolingCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| PoolingCPUKernel() = default; | |||
| ~PoolingCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL(MaxPool, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| PoolingCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_POOLING_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,124 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/pooling_grad_cpu_kernel.h" | |||
| #include <string> | |||
| #include <utility> | |||
| #include <algorithm> | |||
| #include "common/utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void PoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| src_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| dst_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<int> kernel_sizes = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, KSIZE); | |||
| std::vector<int> strides = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, STRIDES); | |||
| if (kernel_sizes.size() != 4 || strides.size() != 4 || src_shape_.size() != 4 || dst_shape_.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "pooling grad invalid input size"; | |||
| } | |||
| std::vector<int> padding_r; | |||
| const std::string pad_mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, PADDING); | |||
| kernel_size_ = kernel_sizes[3]; | |||
| stride_ = strides[3]; | |||
| GetPadding(kernel_node, pad_mode, src_shape_, kernel_size_, stride_, &padding_l_, &padding_r); | |||
| } | |||
| // Routes one output-gradient value to the argmax of its pooling window. | |||
| // box[0] is the window's row range and box[1] its column range; row_max_pair | |||
| // caches, per column, the index and value of the column-wise max over the | |||
| // window's rows (first == 0 marks a column not yet computed), so the cache | |||
| // is reused as the window slides along the row. | |||
| void PoolingGradCPUKernel::RowPoolingGrad(const float *input, float *output, float diff, | |||
| const std::vector<std::pair<size_t, size_t>> &box, | |||
| std::vector<std::pair<size_t, float>> *row_max_pair) { | |||
| float max_value = 0; | |||
| size_t max_index = box[1].second;  // sentinel: no column chosen yet | |||
| size_t src_width = src_shape_[3]; | |||
| size_t index_start; | |||
| size_t index; | |||
| for (size_t i = box[1].first; i < box[1].second; ++i) { | |||
| if ((*row_max_pair)[i].first == 0) { | |||
| // scan the window's rows in column i to find the column-wise max | |||
| index_start = box[0].first * src_width; | |||
| for (size_t j = box[0].first; j < box[0].second; ++j) { | |||
| index = index_start + i; | |||
| if (input[index] > (*row_max_pair)[i].second || j == box[0].first) { | |||
| (*row_max_pair)[i].second = input[index]; | |||
| (*row_max_pair)[i].first = index; | |||
| } | |||
| index_start += src_width; | |||
| } | |||
| } | |||
| // track the best column seen so far within the window | |||
| if ((*row_max_pair)[i].second > max_value || max_index == box[1].second) { | |||
| max_value = (*row_max_pair)[i].second; | |||
| max_index = i; | |||
| } | |||
| } | |||
| output[(*row_max_pair)[max_index].first] += diff; | |||
| } | |||
| // Accumulates the max-pool gradient for one (n, c) plane. The window origin | |||
| // starts at the negative top/left padding and advances by the stride. | |||
| void PoolingGradCPUKernel::ChannelPoolingGrad(const float *input, const float *diff, float *output) { | |||
| int src_width = SizeToInt(src_shape_[3]); | |||
| int src_height = SizeToInt(src_shape_[2]); | |||
| std::vector<std::pair<size_t, float>> row_max_pair(src_shape_[3]); | |||
| std::vector<std::pair<size_t, size_t>> box(2);  // [0]: window row range, [1]: window column range | |||
| int h_start = -padding_l_[0]; | |||
| size_t diff_index = 0; | |||
| for (size_t h = 0; h < dst_shape_[2]; ++h) { | |||
| box[0].first = IntToSize(std::max(h_start, 0)); | |||
| box[0].second = IntToSize(std::min(h_start + kernel_size_, src_height)); | |||
| // reset the per-column max cache for the new window row range | |||
| for (size_t w = 0; w < src_shape_[3]; ++w) { | |||
| row_max_pair[w].first = 0; | |||
| row_max_pair[w].second = 0; | |||
| } | |||
| int w_start = -padding_l_[1]; | |||
| for (size_t w = 0; w < dst_shape_[3]; ++w) { | |||
| box[1].first = IntToSize(std::max(w_start, 0)); | |||
| box[1].second = IntToSize(std::min(w_start + kernel_size_, src_width)); | |||
| RowPoolingGrad(input, output, diff[diff_index], box, &row_max_pair); | |||
| diff_index += 1; | |||
| w_start += stride_; | |||
| } | |||
| h_start += stride_; | |||
| } | |||
| } | |||
| bool PoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 3 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "pooling grad error input output size!"; | |||
| } | |||
| auto input = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto diff = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto output = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto ret = memset_s(output, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "pooling grad memset error"; | |||
| } | |||
| // process each (n, c) plane independently, advancing the pointers plane by plane | |||
| size_t src_wh = src_shape_[2] * src_shape_[3]; | |||
| size_t dst_wh = dst_shape_[2] * dst_shape_[3]; | |||
| for (size_t n = 0; n < src_shape_[0]; ++n) { | |||
| for (size_t c = 0; c < src_shape_[1]; ++c) { | |||
| ChannelPoolingGrad(input, diff, output); | |||
| input = input + src_wh; | |||
| output = output + src_wh; | |||
| diff = diff + dst_wh; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
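| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): the effect | |||
| // ChannelPoolingGrad() implements for one channel -- each output-gradient | |||
| // element is routed to the argmax position of its pooling window. A naive | |||
| // reference without the column-max cache, valid padding only; shapes and | |||
| // values are invented. | |||
| // -------------------------------------------------------------------- | |||
| #include <cstdio> | |||
| int main() { | |||
| const int H = 4, W = 4, K = 2, S = 2;  // 4x4 input, 2x2 window, stride 2 | |||
| const float in[H * W] = {1, 2, 0, 0, 3, 4, 0, 5, 0, 0, 9, 0, 7, 0, 0, 0}; | |||
| const float diff[4] = {10, 20, 30, 40};  // 2x2 output gradient | |||
| float grad[H * W] = {0}; | |||
| for (int oh = 0, d = 0; oh < H / S; ++oh) { | |||
| for (int ow = 0; ow < W / S; ++ow, ++d) { | |||
| int best = oh * S * W + ow * S;  // argmax within the window | |||
| for (int i = 0; i < K; ++i) { | |||
| for (int j = 0; j < K; ++j) { | |||
| int idx = (oh * S + i) * W + ow * S + j; | |||
| if (in[idx] > in[best]) best = idx; | |||
| } | |||
| } | |||
| grad[best] += diff[d]; | |||
| } | |||
| } | |||
| for (int i = 0; i < H * W; ++i) std::printf("%g%c", grad[i], i % W == W - 1 ? '\n' : ' '); | |||
| return 0; | |||
| } | |||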
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_POOLING_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_POOLING_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <utility> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class PoolingGradCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| PoolingGradCPUKernel() = default; | |||
| ~PoolingGradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void RowPoolingGrad(const float *input, float *output, float diff, const std::vector<std::pair<size_t, size_t>> &box, | |||
| std::vector<std::pair<size_t, float>> *row_max_pair); | |||
| void ChannelPoolingGrad(const float *input, const float *diff, float *output); | |||
| int stride_{0}, kernel_size_{0}; | |||
| std::vector<int> padding_l_; | |||
| std::vector<size_t> src_shape_; | |||
| std::vector<size_t> dst_shape_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(MaxPoolGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| PoolingGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_POOLING_GRAD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,52 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/relu_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ReluCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 && src_shape.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "relu kernel dims invalid " << src_shape.size(); | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::eltwise_forward::desc desc = | |||
| dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::eltwise_relu, src_desc, 0.0); | |||
| auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DST, src_desc); | |||
| } | |||
| bool ReluCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_RELU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_RELU_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ReluCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| ReluCPUKernel() = default; | |||
| ~ReluCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), ReluCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_RELU_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,69 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/relu_grad_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ReluGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 && src_shape.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "relu grad kernel dims invalid " << src_shape.size(); | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::eltwise_forward::desc forward_desc = | |||
| dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::eltwise_relu, src_desc, 0.0); | |||
| auto forward_prim_desc = dnnl::eltwise_forward::primitive_desc(forward_desc, MKLKernelEngine::Get().engine()); | |||
| dnnl::eltwise_backward::desc backward_desc = | |||
| dnnl::eltwise_backward::desc(dnnl::algorithm::eltwise_relu, src_desc, src_desc, 0.0, 0.0); | |||
| auto backward_prim_desc = | |||
| dnnl::eltwise_backward::primitive_desc(backward_desc, MKLKernelEngine::Get().engine(), forward_prim_desc); | |||
| primitive_ = std::make_shared<dnnl::eltwise_backward>(backward_prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DIFF_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DIFF_DST, src_desc); | |||
| } | |||
| bool ReluGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "ReluGrad kernel requires 2 inputs and 1 output"; | |||
| } | |||
| if (inputs[0]->size != outputs[0]->size) { | |||
| MS_LOG(EXCEPTION) << "ReluGrad input and output buffer sizes must match"; | |||
| } | |||
| // dnnl computes diff_src = diff_dst * (src > 0); DIFF_SRC and DIFF_DST both | |||
| // point at inputs[0], so the gradient is computed in place and then copied out | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| size_t mem_bits = outputs[0]->size; | |||
| auto ret = memcpy_s(outputs[0]->addr, mem_bits, inputs[0]->addr, mem_bits); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "memcpy_s failed, error code " << ret; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_CPU_RELU_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_CPU_RELU_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ReluGradCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| ReluGradCPUKernel() = default; | |||
| ~ReluGradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| ReluGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| ReluGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_CPU_RELU_GRAD_CPU_KERNEL_H_ | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<int> axis_list = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, AXIS); | |||
| if (axis_list.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "cpu softmax only support input axis size 1"; | |||
| } | |||
| int axis = axis_list[0]; | |||
| if (axis < 0) { | |||
| // normalize negative axes, e.g. -1 means the last axis | |||
| axis += SizeToInt(src_shape.size()); | |||
| } | |||
| if (axis < 0 || axis >= SizeToInt(src_shape.size())) { | |||
| axis = SizeToInt(src_shape.size()) - 1; | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, src_desc, axis); | |||
| auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DST, src_desc); | |||
| } | |||
| bool SoftmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "softmax error input output size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
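| // -------------------------------------------------------------------- | |||
| // Illustrative sketch (editor's addition, not part of the patch): the | |||
| // reference computation the dnnl softmax primitive performs over the chosen | |||
| // axis (here the last one), with the usual max subtraction for numerical | |||
| // stability. Values are invented. | |||
| // -------------------------------------------------------------------- | |||
| #include <cmath> | |||
| #include <cstdio> | |||
| int main() { | |||
| const float x[2][3] = {{1, 2, 3}, {0, 0, 10}}; | |||
| float y[2][3]; | |||
| for (int r = 0; r < 2; ++r) { | |||
| float mx = x[r][0]; | |||
| for (int c = 1; c < 3; ++c) mx = std::fmax(mx, x[r][c]); | |||
| float sum = 0; | |||
| for (int c = 0; c < 3; ++c) sum += y[r][c] = std::exp(x[r][c] - mx); | |||
| for (int c = 0; c < 3; ++c) y[r][c] /= sum; | |||
| } | |||
| std::printf("%.3f %.3f %.3f\n", y[0][0], y[0][1], y[0][2]);  // prints: 0.090 0.245 0.665 | |||
| return 0; | |||
| } | |||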