Compare commits

...

206 Commits
master ... r0.6

Author SHA1 Message Date
  mindspore-ci-bot a2edfcb09b !8171 [Lightweight PR]: update RELEASE.md. 5 years ago
  mindspore-ci-bot bbff1828ba !8185 fix securec download links due to mistakes made by openeuler community 5 years ago
  yanghaoran 94c644cb52 update graphengine, fix securec download links 5 years ago
  shenwei41 df90cf1538 update RELEASE.md. 5 years ago
  mindspore-ci-bot 4ca658319b !6531 [MD] r0.6 Branch: MD5 value update in the file - icu4c.cmake of branch r0.6 5 years ago
  mayang aa89c9f33c MD5 value update in the file icu4c.cmake of branch r0.6 5 years ago
  mindspore-ci-bot d1b1a626c2 !5447 Support manual convert to quantative network of resnet 5 years ago
  chenfei d27f7bf88b add manual quantative network of resnet 5 years ago
  mindspore-ci-bot 50d7480a4e !4457 modify yolov3_quant eval script 5 years ago
  chengxianbin ef9e3a5360 modify yolov3_darknet53 5 years ago
  mindspore-ci-bot 04a6612baf !4424 modify quant DenseBnAct API 5 years ago
  chengxianbin 7bc5b71b44 modify quant DenseBnAct code 5 years ago
  mindspore-ci-bot 30452899ec !4351 modify yolov3-darknet quant net codes 5 years ago
  chengxianbin 59863abcd3 modify yolov3-darknet53 quant code 5 years ago
  mindspore-ci-bot a15ae5238d !4304 upload yolov3-darknet quant net codes 5 years ago
  chengxianbin c80a1da8ac upload yolov3-darknet53 quant code 5 years ago
  mindspore-ci-bot 7d483cd09c !4115 runpackage sync C75B050 for mindspore r0.6 5 years ago
  wuweikang 9f3dcd7ab9 runpackage sync C75B050 for r0.6 5 years ago
  mindspore-ci-bot 801660ef08 !3912 fix numpyslice bug 5 years ago
  YangLuo 16f54c900b fix numpyslice bug 5 years ago
  mindspore-ci-bot 5465525f09 !3812 upgrade dockerfile version to 0.6.0-beta 5 years ago
  mindspore-ci-bot d9320b1606 !3805 modify release note for 0.6 5 years ago
  yanghaoran 34f2e94bd4 update mindspore version to 0.6.0-beta 5 years ago
  changzherui a0e575a17d modify release 5 years ago
  lujiale dc4e15d32c update RELEASE.md. 5 years ago
  lujiale 917a7e227f update RELEASE.md. 5 years ago
  lujiale 1b7daf777a update build.sh. 5 years ago
  mindspore-ci-bot 7d6160516f !3761 simplify googlenet 5 years ago
  panfengfeng ca881ec03e add maxpool_with_argmax/grad cuda kernel 5 years ago
  mindspore-ci-bot 983437feaf !3757 debug mindspore hub 5 years ago
  chenzomi a059e8910f debug mindspore hub 5 years ago
  mindspore-ci-bot 9dc23eeb98 !3602 Delete hard code in pull node 5 years ago
  ZPaC 78e3cb4bc4 Delete hard code in pull kernel. 5 years ago
  mindspore-ci-bot 0db3ff5773 !3742 fix GetInputReshapeType reports ERROR 5 years ago
  mindspore-ci-bot c9583ad3a4 !3730 fix bug of cast dtype when using mix_presion in pynative mode 5 years ago
  mindspore-ci-bot 294520e1fd !3548 Pass optimzier attributes to push kernel and parameter server. 5 years ago
  mindspore-ci-bot 4621565258 !3733 block trans data to change format 5 years ago
  mindspore-ci-bot b3b71e1d3f !3724 modify readme and timemoniter steps 5 years ago
  liubuyu 7d5e523743 fix set/get reshape type bug 5 years ago
  mindspore-ci-bot 0fb669190a !3703 Enlarge the threshold of resnet50 performance st in pynative 5 years ago
  jinyaohui db216a077a fix bug of cast dtype when using mix_presion in pynative mode 5 years ago
  lvchangquan f298e55072 block use trans data to change format 5 years ago
  mindspore-ci-bot dcd471eb96 !3718 add mindspore hub for download ckpt file 5 years ago
  wanghua c9a675f4e5 modify readme and timemoniter steps 5 years ago
  mindspore-ci-bot fdc183ad36 !3704 [r0.6][bug][auto_mixed_precision]fix amp bug in eval 5 years ago
  lvliang 937c5b5d8e enlarge the threshold of resnet50 performance in pynative 5 years ago
  chenzomi 783b823a25 add mindspore hub for download ckpt file 5 years ago
  mindspore-ci-bot 30ffcd8a1f !3681 modelzoo: support vgg16 in GPU 5 years ago
  mindspore-ci-bot 9ab94fa076 !3685 add tinybert scripts 5 years ago
  mindspore-ci-bot 944929f980 !3682 add googlenet gpu 5 years ago
  mindspore-ci-bot 09dd4128d5 !3689 fix cpu multi graph mem error 5 years ago
  Wei Luning ca4b2f6c0b fix eval in amp 5 years ago
  mindspore-ci-bot 7f3926429b !3628 fix log bug 5 years ago
  kswang 7360a2fa07 fix cpu multi graph mem error 5 years ago
  mindspore-ci-bot 10f0f0d5a5 !3673 fix serving input numbers 5 years ago
  mindspore-ci-bot 6b81f9f7f7 !3683 Modify patches and alerts 5 years ago
  mindspore-ci-bot 5a36b19e80 !3666 Modify the order of init and open of TDT 5 years ago
  mindspore-ci-bot 6944af09ee !3596 fix batchnorm issue under mix precision in pynative mode 5 years ago
  ms_yan e497117b74 init add vgg16 gpu version 5 years ago
  mindspore-ci-bot 78375e104a !3680 lowering value checking threshold to fix bug of pass eps 5 years ago
  mindspore-ci-bot abd346e84b !3649 modify setup.py version number for r0.6 5 years ago
  mindspore-ci-bot 9156775655 !3677 support multy node training in deeplabv3 5 years ago
  mindspore-ci-bot df7f0c8a7c !3659 modify readme for maskrcnn 5 years ago
  wanghua 9da1c96c4a add tinybert scripts 5 years ago
  panfengfeng 7d5a67e9f0 googlenet-gpu 5 years ago
  kingfo fc92598881 fix batchnorm issue in pynative auto mix precision 5 years ago
  mindspore-ci-bot e3fe1d76ca !3558 Fix a racing condition in CacheMergeOp when the leaf hits an error and exit too early 5 years ago
  mindspore-ci-bot b429a8421f !3586 fix python api doc for mindspore .dataset 5 years ago
  mindspore-ci-bot bb4339e3ca !3584 Fix a DatasetCache sharing scenario 5 years ago
  shenwei41 e49a2f83e7 Modify patches and alerts 5 years ago
  mindspore-ci-bot 1ec63700c7 !3632 Fix resource not release bug 5 years ago
  wangnan39@huawei.com fc5d419422 Lowering value checking threshold to support fix the bug of pass add eps 5 years ago
  mindspore-ci-bot d4b5cda934 !3604 Fix minor errors in probabilistic programming 5 years ago
  ZPaC d6a56cd6fd Pass optimizer attributes to push nodes. 5 years ago
  mindspore-ci-bot f04243b1f1 !3663 Fix multi worker 5 years ago
  mindspore-ci-bot 6b57b4f0e1 !3652 add epoch_num description 5 years ago
  zhouyaqiang b096a6cbe9 support multy node training and remove code 5 years ago
  hanjun996 c718774538 modify tdt 5 years ago
  mindspore-ci-bot 68128f87a9 !3634 Spilt unsupported transdata 5 years ago
  mindspore-ci-bot 22dbd1a233 !3646 [MD] fix minddataset core dump when file list size ia greater than 1000. 5 years ago
  hexia 52776820d8 fix_input_check 5 years ago
  guansongsong 5b15f40598 Fix a DatasetCache sharing scenario 5 years ago
  cristoval bf74164df3 fix sync sgd under multi-worker 5 years ago
  meixiaowei e5b9776b86 modify readme 5 years ago
  panfengfeng 8803c6258d add epoch_num 5 years ago
  changzherui 614841aa39 modify setup version number 5 years ago
  wuyongkang 983cb9b23d Fix resource not release bug 5 years ago
  guansongsong 68f27eb62b fix python api doc for mindspore.dataset 5 years ago
  mindspore-ci-bot 924a34acb8 !3639 fix GeneratorDataset time out 5 years ago
  mindspore-ci-bot db01f3eafe !3640 support bprop for const in pynative and develop stridedslice and isinstance 5 years ago
  liyong 66d8395fea fix coredump when number of file list more than 1000. 5 years ago
  mindspore-ci-bot e33b5e435e !3633 fix dataset & train gil lock of gpu process 5 years ago
  mindspore-ci-bot 477bf42fe5 !3641 Update submodule akg to r0.6 branch 5 years ago
  WilliamLian edba641ddb split unsupported transdata 5 years ago
  mindspore-ci-bot 338a225410 !3623 [r0.6][bug][auto_mixed_precision]fix amp doc and eval network build 5 years ago
  looop5 13d8bedbf4 update submodule akg to r0.6 branch 5 years ago
  mindspore-ci-bot 9a43468fee !3626 fix: device occupied tdt hung 5 years ago
  buxue 6beb8071d7 support bprop for const in pynative and develop stridedslice and isinstance. 5 years ago
  mindspore-ci-bot cc233f66ab !3629 Fix numpyslice issue 5 years ago
  yanghaitao 248130e5d1 fix generator time out 5 years ago
  mindspore-ci-bot 8f6eafdfcd !3589 fix the description of cache 5 years ago
  xiefangqi 30ed5a25ce fix numpyslice issue to r0.6 5 years ago
  panfengfeng 4eea891730 fix dataset train gil of gpu 5 years ago
  gukecai fe29a2501f fix log bug 5 years ago
  jonyguo 0d375bbaa3 fix: device occupied tdt hung 5 years ago
  mindspore-ci-bot 4f1e586ee3 !3579 fix maskrcnn dataset rescale bug 5 years ago
  Wei Luning dd26d85caf fix doc and eval network build in amp 5 years ago
  peixu_ren 49cdeb3f78 Fix minor errors in probabilistic programming 5 years ago
  mindspore-ci-bot d9ca3f2e88 !3566 dataset: api format problem in totype, totensor, slice 5 years ago
  mindspore-ci-bot c5f8b6b0c7 !3599 merge fix sparse doc to r0.6 5 years ago
  panyifeng 3714a07d71 fix sparse api doc 5 years ago
  mindspore-ci-bot 950367c102 !3595 add desc about sink_size 5 years ago
  jinyaohui 40b859395d add description about sink_size 5 years ago
  mindspore-ci-bot d7caa7955b !3582 Fix minddata cache include flatbuffer head problem 5 years ago
  mindspore-ci-bot 552490326f !3572 [MD] fix save pydoc and log 5 years ago
  guansongsong 543b75f366 fix the description of cache 5 years ago
  mindspore-ci-bot 3d87436bb0 !3580 fix allreduce fusion case in grad reducer 5 years ago
  ms_yan 47efc83bcd repair api format problem in totype, totensor, slice 5 years ago
  xiefangqi 0e4065f0ef fix flatbuffer head to r0.6 5 years ago
  Ziyan fdb21ecf74 update 5 years ago
  meixiaowei 7df05b1da7 fix rescale dataset bug 5 years ago
  mindspore-ci-bot c617a07dff !3533 modify serving readme 5 years ago
  liyong f52859a2fc fix save op pydoc and log 5 years ago
  mindspore-ci-bot 2a6884d97c !3564 [Auto parallel] Cost model for GPU 5 years ago
  dinghao b54fc35cde modify serving readme 5 years ago
  Xiaoda Zhang ab676ba81a add costmodel for gpu 5 years ago
  Jesse Lee f118869869 Fix a merge_op timing hole 5 years ago
  mindspore-ci-bot c31c1c808a !3530 Fix a bug for Parameter 5 years ago
  mindspore-ci-bot 67600c1d8c !3539 Change at-most collected tensor summary from 50 to 20 when auto-calculated 5 years ago
  mindspore-ci-bot 49e8727d37 !3518 fix python import r0.6 5 years ago
  mindspore-ci-bot 36c2bbdbcc !3501 fix sparse feature bug for auto parallel 5 years ago
  mindspore-ci-bot a536e922c2 !3524 add bert ci script to r0.6 branch 5 years ago
  Li Hongzhang d86668d216 change at-most collected tensor from 50 to 20 5 years ago
  mindspore-ci-bot bcba696a62 !3482 `max_file_size` includes metadata and drops the last step 5 years ago
  He Wei 1f6771256d Fix a bug for Parameter 5 years ago
  yoonlee666 1dcf9abf6a add bert ci script 5 years ago
  hexia 5fb1280e12 fix python import 5 years ago
  mindspore-ci-bot dfab48d532 !3492 Change readme.txt in WarpCTC and checkpoint directory 5 years ago
  Li Hongzhang 5a517f3a49 max_file_size include metadata length and drop last step 5 years ago
  mindspore-ci-bot 62cf01fc7b !3509 Add parameter server mode_zoo case and CI test cases. 5 years ago
  ZPaC b109e6f643 Add parameter server model_zoo case and CI test cases. 5 years ago
  mindspore-ci-bot fdf198eee9 !3493 Modify comment of register_backward_hook [r0.6] 5 years ago
  mindspore-ci-bot 7f6f140d94 !3498 Fix getting output address of internal output 5 years ago
  mindspore-ci-bot ec3e7269ba !3505 merge eager mode enable sparse to r0.6 5 years ago
  panyifeng 032c5e0fdc eager mode enable sparse 5 years ago
  mindspore-ci-bot 9626532e0b !3499 Delete parameter name hard code for embedding-lookup 5 years ago
  mindspore-ci-bot 304ae51a25 !3470 Init CSV column default list when it's empty r0.6 5 years ago
  yangyongjie 2241017e3f fix word missing in readme.txt 5 years ago
  ZPaC c1b36c3d4f Delete parameter name hard code for embedding table. 5 years ago
  lirongzhen1 8af4a16d9d fix sparse feature bug for auto parallel 5 years ago
  yujianfeng 67ed5451ad Fix getting output address of internal output 5 years ago
  mindspore-ci-bot ac564a9e86 !3466 fix cpu nonop net fp16 error 5 years ago
  mindspore-ci-bot 375078cf55 !3471 Fixing Bug with AutoContrast/Equalize supporting uint8 dtype/mnist 5 years ago
  simson 63bb52b408 Modify comment of register_backward_hook 5 years ago
  mindspore-ci-bot c9f25d0d5c !3477 upload maskrcnn scripts 5 years ago
  mindspore-ci-bot b0cb13d265 !3463 [MD]Fix Segementation Falut when SentencepieceTokenizer Op before zipOp and ConcatOp 5 years ago
  mindspore-ci-bot 14ce0afab3 !3478 Add Warpctc GPU network 5 years ago
  mindspore-ci-bot 26733198e9 !3458 fix getdataset size error r0.6 5 years ago
  mindspore-ci-bot 73f58dc937 !3480 Graceful shutdown for ps modules 5 years ago
  cristoval c1332c03e5 support graceful shutdown for ps components 5 years ago
  meixiaowei 10c74de9b6 upload maskrcnn scripts 5 years ago
  yangyongjie 28b9074e9b add warpctc GPU 5 years ago
  mindspore-ci-bot 63442d563f !3402 [AutoParallel]Fix autoparallel gpu bug 5 years ago
  islam_amin b0e83c5a06 Fixing AutoContrast/Equalize Bug 5 years ago
  kswang 9f5315fc80 fix cpu nonop net fp16 error 5 years ago
  panfengfeng 4e7cb1a7a4 fix get daataset size error 5 years ago
  jiangzhiwen d408cdf0e0 init column_default_list_ when it is empty 5 years ago
  mindspore-ci-bot c5e6cfebe7 !3436 fix mix precision operator issue 5 years ago
  cristoval aac2275d1b support graceful shutdown for ps components 5 years ago
  mindspore-ci-bot 70aee2fe7a !3401 cpp client example 5 years ago
  xulei2020 c43bc92d7c add code 5 years ago
  kingfo 5916da1763 fix mix precision operator issue 5 years ago
  mindspore-ci-bot 50e20e4042 !3443 Restore the code to collect the graph network 5 years ago
  Li Hongzhang 2373e94384 restore the ability to collect network graph 5 years ago
  mindspore-ci-bot cda920b21b !3432 add single quotes and modify parameters 5 years ago
  mindspore-ci-bot af4b4fb36d !3417 fix bug of group lr when save ckpt 5 years ago
  mindspore-ci-bot 927a52fdf8 !3388 Transfer tuple getitem's control to new added memcpy_async 5 years ago
  mindspore-ci-bot 0f8c4d6794 !3428 modify annotation: wegith_decay modify weight_decay 5 years ago
  李嘉琪 8feb9450f2 add single quotes and modify parameters 5 years ago
  lilei f304fe9614 modify weight_decay annotation 5 years ago
  mindspore-ci-bot e62137f7c0 !3406 fix optimizer parallel problems 5 years ago
  mindspore-ci-bot c005dfd803 !3389 merge sparse hot fix to r0.6 5 years ago
  mindspore-ci-bot a051d7c5dc !3410 [bug][ci]fix bug when remove the phis 5 years ago
  wangnan39@huawei.com 3c93ff3385 fix_bug_of_group_lr_when_save_ckpt 5 years ago
  Wei Luning 43d4f80428 fix bug in remove phiphi should replace the inner ones first 5 years ago
  Ziyan 9f264b6e55 fix optimizer parallel problems 5 years ago
  panyifeng 2cebc62bbf fix sparse related issues 5 years ago
  mindspore-ci-bot f9aec99c01 !3379 modify the vgg16/lstm path to offical/{cv/nlp} 5 years ago
  CaoJian 80a655099a modify the vgg16/lstm path to offical/{cv/nlp} 5 years ago
  hexia f14974392c cpp_client_example_r0.6 5 years ago
  huanghui 3901c0414f deal tuple getitem control for new added memcpy 5 years ago
  lichenever 12738ceda7 fix auto parallel gpu bug 5 years ago
  mindspore-ci-bot fe0348b3d7 !3380 Fix visit depend node 5 years ago
  mindspore-ci-bot 93ce266ae5 !3373 support call super when class define in test_case 5 years ago
  WilliamLian 35b466f8f7 fix visit depend node 5 years ago
  buxue 15487759ff support call super when class define in test_case. 5 years ago
  mindspore-ci-bot 251fba00f5 !3363 fix cloner when funcgraph return is null 5 years ago
  mindspore-ci-bot 984be47299 !3365 restructure client example 5 years ago
  mindspore-ci-bot 45d8a9eea3 !3354 improve performance of bert by adding order paramters 5 years ago
  mindspore-ci-bot 5cdfbf0e82 !3359 fix cpu nonop net 5 years ago
  hexia 9daa8a890b restructure client example 5 years ago
  leopz 61bf0c5d99 fix cloner when funcgraph is null 5 years ago
  mindspore-ci-bot 27982ebbe8 !3347 Fix internal multiple outputs check 5 years ago
  kswang 926120ef95 cpu support nonop net 5 years ago
  shibeiji 1ae2d2d6c8 add order params for bert to improve performance 5 years ago
  yujianfeng 16035dc62c Fix internal multiple outputs check 5 years ago
100 changed files with 1868 additions and 294 deletions
  1. +1 -0 CMakeLists.txt
  2. +1 -1 README.md
  3. +76 -1 RELEASE.md
  4. +1 -1 akg
  5. +2 -2 build.sh
  6. +1 -1 cmake/external_libs/icu4c.cmake
  7. +1 -0 cmake/external_libs/jpeg_turbo.cmake
  8. +7 -0 cmake/package.cmake
  9. +67 -0 docker/mindspore-cpu/0.6.0-beta/Dockerfile
  10. +83 -0 docker/mindspore-gpu/0.6.0-beta/Dockerfile
  11. +1 -1 graphengine
  12. +19 -0 mindspore/_extends/builtin_operations.py
  13. +19 -19 mindspore/_extends/parse/parser.py
  14. +3 -1 mindspore/_extends/parse/standard_method.py
  15. +2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h
  16. +3 -2 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pull_kernel.h
  17. +4 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h
  18. +3 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc
  19. +2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h
  20. +17 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc
  21. +2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h
  22. +3 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.cc
  23. +2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h
  24. +226 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cu
  25. +25 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cuh
  26. +149 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cu
  27. +25 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cuh
  28. +30 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.cc
  29. +160 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.h
  30. +36 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.cc
  31. +168 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.h
  32. +6 -0 mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel_metadata.cc
  33. +40 -2 mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc
  34. +31 -4 mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h
  35. +2 -2 mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc
  36. +2 -0 mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
  37. +4 -4 mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
  38. +39 -20 mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc
  39. +2 -2 mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
  40. +1 -1 mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc
  41. +65 -0 mindspore/ccsrc/backend/optimizer/ascend/format_type/split_unsupported_transdata.cc
  42. +37 -0 mindspore/ccsrc/backend/optimizer/ascend/format_type/split_unsupported_transdata.h
  43. +1 -1 mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
  44. +40 -19 mindspore/ccsrc/backend/session/ascend_inference_session.cc
  45. +3 -1 mindspore/ccsrc/backend/session/ascend_inference_session.h
  46. +1 -3 mindspore/ccsrc/backend/session/ascend_session.cc
  47. +1 -4 mindspore/ccsrc/backend/session/cpu_session.cc
  48. +5 -4 mindspore/ccsrc/backend/session/gpu_session.cc
  49. +10 -20 mindspore/ccsrc/backend/session/kernel_graph.cc
  50. +1 -4 mindspore/ccsrc/backend/session/kernel_graph.h
  51. +6 -8 mindspore/ccsrc/backend/session/session_basic.cc
  52. +1 -2 mindspore/ccsrc/backend/session/session_basic.h
  53. +13 -4 mindspore/ccsrc/frontend/operator/prim_others.cc
  54. +3 -2 mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.h
  55. +8 -1 mindspore/ccsrc/frontend/parallel/costmodel_context.cc
  56. +1 -0 mindspore/ccsrc/frontend/parallel/costmodel_context.h
  57. +7 -0 mindspore/ccsrc/frontend/parallel/ps/common.h
  58. +7 -4 mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc
  59. +119 -19 mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
  60. +2 -3 mindspore/ccsrc/frontend/parallel/ps/scheduler.cc
  61. +14 -0 mindspore/ccsrc/frontend/parallel/ps/util.cc
  62. +2 -0 mindspore/ccsrc/frontend/parallel/ps/util.h
  63. +54 -11 mindspore/ccsrc/frontend/parallel/ps/worker.h
  64. +25 -1 mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h
  65. +3 -1 mindspore/ccsrc/minddata/dataset/CMakeLists.txt
  66. +4 -0 mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc
  67. +4 -0 mindspore/ccsrc/minddata/dataset/core/tensor.cc
  68. +1 -1 mindspore/ccsrc/minddata/dataset/engine/cache/cache_client.h
  69. +1 -1 mindspore/ccsrc/minddata/dataset/engine/cache/cache_request.h
  70. +2 -1 mindspore/ccsrc/minddata/dataset/engine/cache/cache_service.h
  71. +9 -0 mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc
  72. +2 -2 mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
  73. +5 -0 mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc
  74. +3 -2 mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
  75. +18 -13 mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.cc
  76. +1 -0 mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h
  77. +7 -2 mindspore/ccsrc/minddata/dataset/util/task_manager.cc
  78. +1 -0 mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc
  79. +2 -1 mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h
  80. +1 -1 mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
  81. +2 -2 mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
  82. +1 -1 mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
  83. +14 -4 mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
  84. +11 -5 mindspore/ccsrc/pipeline/jit/parse/parse.cc
  85. +9 -1 mindspore/ccsrc/pipeline/jit/pipeline.cc
  86. +12 -1 mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
  87. +1 -1 mindspore/ccsrc/pipeline/pynative/base.h
  88. +18 -9 mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
  89. +1 -0 mindspore/ccsrc/pipeline/pynative/pynative_execute.h
  90. +2 -1 mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
  91. +10 -6 mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.cc
  92. +4 -0 mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc
  93. +5 -0 mindspore/ccsrc/runtime/device/cpu/cpu_device_address.cc
  94. +8 -4 mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc
  95. +6 -7 mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.cc
  96. +3 -4 mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.h
  97. +3 -10 mindspore/ccsrc/runtime/device/cpu/cpu_simple_mem_plan.cc
  98. +1 -6 mindspore/ccsrc/runtime/device/cpu/cpu_simple_mem_plan.h
  99. +0 -1 mindspore/ccsrc/runtime/device/kernel_runtime.cc
  100. +1 -11 mindspore/ccsrc/transform/graph_ir/op_declare.cc

+1 -0 CMakeLists.txt

@@ -106,6 +106,7 @@ endif() # NOT ENABLE_ACL

if (ENABLE_SERVING)
add_subdirectory(serving)
add_subdirectory(serving/example/cpp_client)
endif()

if (NOT ENABLE_ACL)


+1 -1 README.md

@@ -75,7 +75,7 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex
1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package.

```
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.6.0-cp37-cp37m-linux_x86_64.whl
```

2. Run the following command to verify the install.


+76 -1 RELEASE.md

@@ -1,3 +1,76 @@
# Release 0.6.0-beta

## Major Features and Improvements
### Ascend 910 Training and Inference Framework
* New models
* There are official, research and community directories under modelzoo.
* Official is maintained with the newest APIs by the MindSpore team; MaskRCNN was added.
* Research is uploaded by researchers for official review, and APIs may not be updated in time.
* Community reprints the relevant links of partner research results.
* Hub was added at the same level as modelzoo, for synchronous storage of the materials needed by the official hub web pages, which will be launched soon.
* Support pre-trained models: a few lines of code can download and load a pre-trained model for inference or transfer learning.
* Frontend and user interface
* Supports user side operator compilation and graph execution error rendering.
* Unified the definition of dynamic learning rate behavior in optimizers.
* Support IndexSlice in sparse expression.
* Support calling the parent class's construct method inside construct.
* Support saving checkpoint files asynchronously.
* Support implicit type conversion in pynative mode.
* User interfaces change log
* uniform learning rate behavior in optimizers([!2755](https://gitee.com/mindspore/mindspore/pulls/2755))
* rename operator of sparse optimizer([!3217](https://gitee.com/mindspore/mindspore/pulls/3217))
* move profiler module from mindinsight to mindspore([!3075](https://gitee.com/mindspore/mindspore/pulls/3075))
* VOCDataset output change to multi-columns([!3093](https://gitee.com/mindspore/mindspore/pulls/3093))
* GetDatasize feature([!3212](https://gitee.com/mindspore/mindspore/pulls/3212))
* dataset: modify config api([!2936](https://gitee.com/mindspore/mindspore/pulls/2936))
* Executor and performance optimization
* Decouple C++ and Python to make the architecture more extensible.
* Parameter Server for distributed deep learning supported.
* Serving: a flexible service deployment framework for deep learning models.
* Memory reuse is enhanced, and the batch size of Bert large model is increased from 96 to 160 on a single server.
* Data processing, augmentation, and save format
* Support MindRecord save operator after data processing
* Support automatic fusion operator, such as decode/resize/crop
* Support CSV dataset loading
### Other Hardware Support
* GPU platform
* New models supported: ResNext50, WarpCTC and GoogLeNet.
* Support hyperparameter search and data-augmentation AutoML on GPU.
* Support Resnet50 automatic parallel in GPU backend.

## Bugfixes
* Models
* Improved the performance and accuracy on ResNet50([!3456](https://gitee.com/mindspore/mindspore/pulls/3456))
* Fixed the performance test case of bert([!3486](https://gitee.com/mindspore/mindspore/pulls/3486))
* Python API
* Fix assign used in while loop([!2720](https://gitee.com/mindspore/mindspore/pulls/2720))
* Revert optimize the graph output of all nop node.([!2857](https://gitee.com/mindspore/mindspore/pulls/2857))
* Print tensor as numpy.([!2859](https://gitee.com/mindspore/mindspore/pulls/2859))
* Support weight decay for sparse optimizer([!2668](https://gitee.com/mindspore/mindspore/pulls/2668))
* Fix BatchToSpaceND([!2741](https://gitee.com/mindspore/mindspore/pulls/2741))
* Fixing type check mistakes of InplaceAdd and InplaceSub ops([!2744](https://gitee.com/mindspore/mindspore/pulls/2744))
* Change order param only equal to group param([!2748](https://gitee.com/mindspore/mindspore/pulls/2748))
* Executor
* The performance of graphs with control flow is optimized([!2931](https://gitee.com/mindspore/mindspore/pulls/2931))
* Fix bug of wrong number of tuple layers([!3390](https://gitee.com/mindspore/mindspore/pulls/3390))
* Fix cpu multi graph memory exception([!3631](https://gitee.com/mindspore/mindspore/pulls/3631))
* Enable data sync when calling operator without defining a cell([!3081](https://gitee.com/mindspore/mindspore/pulls/3081))
* Fix argmaxwith value error in pynative mode on GPU([!3082](https://gitee.com/mindspore/mindspore/pulls/3082))
* Fix precision error with fp16 input on pynative mode([!3196](https://gitee.com/mindspore/mindspore/pulls/3196))
* Data processing
* Fix bug of RandomColor and RandomSharpness default parameter checking ([!2833](https://gitee.com/mindspore/mindspore/pulls/2833))
* Fix process hung when training and eval ([!3469](https://gitee.com/mindspore/mindspore/pulls/3469))
* Third party
* Sqlite : Update sqlite to 3.32.2 to handle [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358), [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), [CVE-2020-13632](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13632), [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), and [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655).
* Libjpeg-turbo : Update libjpeg-turbo to 2.0.4 to handle [CVE-2020-13790](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13790).

## Contributors
Thanks goes to these wonderful people:

Alexey Shevlyakov, avakh, baihuawei, BowenK, buxue, caifubi, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenjianping, chentingting, chenzomi, chujinjin, Danish Farid, dayschan, dengwentao, dinghao, etone-chan, fangzehua, fary86, geekun, Giancarlo Colmenares, gong chen, gukecai, guohongzilong, hangangqiang, heleiwang, hesham, He Wei, hexia, hongxing, huangdongrun, huanghui, islam_amin, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jojobugfree, Jonathan Yan, jonyguo, Junhan Hu, Kang, kingfo, kouzhenzhong, kpy, kswang, laiyongqiang, leopz, liangzelang, lichenever, lihongkang, Li Hongzhang, lilei, limingqi107, lirongzhen1, liubuyu, liuchongming74, liuwenhao4, liuxiao, Lixia Chen, liyanliu, liyong, lizhenyu, lvliang, Mahdi, Margaret_wangrui, meixiaowei, ms_yan, nhussain, ougongchang, panfengfeng, panyifeng, peilinwang, Peilin Wang, pkuliuliu, qianlong, rick_sanchez, shibeiji, Shida He, shijianning, simson, sunsuodong, suteng, Tinazhang, Tron Zhang, unknown, VectorSL, wandongdong, wangcong, wangdongxu, wangdongxu6, wanghua, wangnan39, Wei Luning, wenchunjiang, wenkai, wilfChen, WilliamLian, wukesong, Xian Weizhao, Xiaoda Zhang, xiefangqi, xulei2020, xunxue, xutianchun, Yang, yanghaitao, yanghaitao1, yanghaoran, yangjie, yangjie159, YangLuo, Yanjun Peng, yankai, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yuchaojie, yujianfeng, zhangzhongpeng, zhangdengcheng, Zhang Qinghua, zhangyinxia, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang, wangdongxu

Contributions of any kind are welcome!

# Release 0.5.0-beta

## Major Features and Improvements
@@ -62,6 +135,8 @@
* Fix bug of Cifar dataset reading([!2096](https://gitee.com/mindspore/mindspore/pulls/2096))
* Fix bug of C++ behavior in RandomCropAndResize([!2026](https://gitee.com/mindspore/mindspore/pulls/2026))
* Fix the bug of mindrecord shuffle([!2420](https://gitee.com/mindspore/mindspore/pulls/2420))
* Third party
* Sqlite : Update sqlite to 3.32.2 to handle [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358), [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), [CVE-2020-13632](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13632), [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), and [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655).

## Contributors
Thanks goes to these wonderful people:
@@ -134,7 +209,7 @@ Contributions of any kind are welcome!
* Fix sens shape error of TrainOneStepWithLossScaleCell([!1050](https://gitee.com/mindspore/mindspore/pulls/1050))
* Fix BatchNormGrad operator([!1344](https://gitee.com/mindspore/mindspore/pulls/1344))
* Executor
* Fix dropouttopK and addn errors in PyNative mode ([!1285](https://gitee.com/mindspore/mindspore/pulls/1285), [!1138](https://gitee.com/mindspore/mindspore/pulls/1138), [!1033](https://gitee.com/mindspore/mindspore/pulls/1033)).
* Fix dropout, topK and addn errors in PyNative mode ([!1285](https://gitee.com/mindspore/mindspore/pulls/1285), [!1138](https://gitee.com/mindspore/mindspore/pulls/1138), [!1033](https://gitee.com/mindspore/mindspore/pulls/1033)).
* Fix memory leaks after execution in PyNatvie mode ([!1201](https://gitee.com/mindspore/mindspore/pulls/1201)).
* Fix HCCL failure in some special scenes ([!1204](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1204), [!1252](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1252)).
* Fix SSD network when Select failed, cann't find kernel info([!1449](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1449)).


+1 -1 akg

@@ -1 +1 @@
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625
Subproject commit 5c0e3d2ffb6ba7650453c3b11163237a43d206d6

+2 -2 build.sh

@@ -491,9 +491,9 @@ build_predict()

cd "${BASEPATH}/predict/output/"
if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then
tar -cf MSPredict-0.5.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.6.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then
tar -cf MSPredict-0.5.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.6.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
fi
echo "success to build predict project!"
}


+1 -1 cmake/external_libs/icu4c.cmake

@@ -8,7 +8,7 @@ else()
VER 67.1
LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
MD5 0c2662a2b0bc80b0eb56495205247c8f
MD5 fd525fb47d8827b0b7da78b51dd2d93f
CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh
)
include_directories(${icu4c_INC})


+1 -0 cmake/external_libs/jpeg_turbo.cmake

@@ -12,6 +12,7 @@ mindspore_add_pkg(jpeg_turbo
URL https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.4.tar.gz
MD5 44c43e4a9fb352f47090804529317c88
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE
PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/jpeg_turbo/jpeg_turbo.patch001
)
include_directories(${jpeg_turbo_INC})
add_library(mindspore::jpeg_turbo ALIAS jpeg_turbo::jpeg)

+7 -0 cmake/package.cmake

@@ -278,6 +278,13 @@ if (ENABLE_SERVING)
COMPONENT mindspore
)

file(GLOB MS_SERVING_PY_LIST ${CMAKE_SOURCE_DIR}/serving/*.py)
install(
FILES ${MS_SERVING_PY_LIST}
DESTINATION ${INSTALL_PY_DIR}
COMPONENT mindspore
)

install(
TARGETS inference
DESTINATION ${INSTALL_LIB_DIR}


+67 -0 docker/mindspore-cpu/0.6.0-beta/Dockerfile

@@ -0,0 +1,67 @@
FROM ubuntu:18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV PATH /usr/local/bin:$PATH

# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion

# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex

# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

# Set pip source
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.6.0-cp37-cp37m-linux_x86_64.whl

+83 -0 docker/mindspore-gpu/0.6.0-beta/Dockerfile

@@ -0,0 +1,83 @@
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5
ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH

# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion

# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex \
libnccl2=2.4.8-1+cuda10.1 \
libnccl-dev=2.4.8-1+cuda10.1

# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

# Set pip source
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install openmpi (v3.1.5)
RUN cd /tmp \
&& wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
&& tar -xvf openmpi-3.1.5.tar.gz \
&& cd /tmp/openmpi-3.1.5 \
&& mkdir -p ${OMPI_ROOT_PATH} \
&& ./configure --prefix=${OMPI_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -rf /tmp/openmpi-3.1.5 \
&& rm -f /tmp/openmpi-3.1.5.tar.gz

# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.6.0-cp37-cp37m-linux_x86_64.whl

+1 -1 graphengine

@@ -1 +1 @@
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009
Subproject commit 885af56694eff438a4ea079c0c34de30993f1473

+19 -0 mindspore/_extends/builtin_operations.py

@@ -14,7 +14,10 @@
# ============================================================================
"""builtin_operations"""
import numpy as np
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
import mindspore.common.dtype as mstype
from mindspore.common.dtype import dtype_to_nptype, get_py_obj_dtype


@@ -113,6 +116,7 @@ def bool_or(x, y):
"""Implement `bool_or`."""
return x or y


def vm_compare(*args):
"""Implement `vm_compare` for tensor."""
obj_str = args[-1]
@@ -141,10 +145,12 @@ def list_len(x):
"""Implement `list_len`."""
return len(x)


def Depend(value, expr):
"""Implement `Depend`."""
return value


# only used in PyNative mode
def make_ref(key, value, ref):
return value
@@ -171,3 +177,16 @@ def tuple_to_array(x):
def stop_gradient(x):
"""Implement `stop_gradient`."""
return x


hyper_map = C.HyperMap()


def mixed_precision_cast(dst_type, x):
"""Implement `mixed_precision_cast`."""
def cast_inner(data):
if isinstance(data, Tensor) and data.dtype in (mstype.float32, mstype.float16):
return F.cast(data, dst_type)
return data

return hyper_map(cast_inner, x)

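Note: the `mixed_precision_cast` helper added above uses `HyperMap` to apply `cast_inner` over arbitrarily nested inputs. Below is a dependency-free sketch of the same map-over-structure pattern (a hypothetical helper for illustration, not MindSpore code), assuming the goal is to convert float-like leaves while leaving everything else untouched:

```
# Illustrative only: recursively convert float leaves of a nested structure,
# mirroring how hyper_map applies cast_inner to each element above.
def cast_nested(dst, value):
    if isinstance(value, (tuple, list)):
        return type(value)(cast_nested(dst, v) for v in value)
    return dst(value) if isinstance(value, float) else value

print(cast_nested(int, (1.5, (2.5, "keep"), 3)))  # (1, (2, 'keep'), 3)
```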
+19 -19 mindspore/_extends/parse/parser.py

@@ -459,27 +459,27 @@ class Parser:
logger.debug("ops info = %r", ops_info)
return ops_info

def analyze_super(self, father_class_node, subclass_instance):
def analyze_super(self, class_type_node, subclass_instance):
"""Analyze super and return a class instance."""
father_class = None
if father_class_node is None:
father_class = type(subclass_instance)
if isinstance(father_class_node, ast.Name):
father_class_name = getattr(father_class_node, 'id')
father_class = self.global_namespace[father_class_name]
if isinstance(father_class_node, ast.Attribute):
value = getattr(father_class_node, 'value')
attr = getattr(father_class_node, 'attr')
module_name = getattr(value, 'id')
father_class_module = self.global_namespace[module_name]
father_class = getattr(father_class_module, attr)
if father_class is None:
raise ValueError("When call 'super', the father class is None.")
if not isinstance(subclass_instance, father_class):
sub_class = type(subclass_instance)
if class_type_node is None:
return super(sub_class, subclass_instance)
if isinstance(class_type_node, ast.Name):
class_name = getattr(class_type_node, 'id')
elif isinstance(class_type_node, ast.Attribute):
class_name = getattr(class_type_node, 'attr')
else:
raise ValueError(f"When call 'super', the first arg should be a class type, "
f"but got {class_type_node.__class__.__name__}.")

target_father_class = None
for class_element in sub_class.mro():
if class_element.__name__ == class_name:
target_father_class = class_element
break
if target_father_class is None:
raise ValueError("When call 'super', the second arg should be an instance of first arg.")

target_class_instance = super(father_class, subclass_instance)
return target_class_instance
return super(target_father_class, subclass_instance)

def get_location(self, node):
"""

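Note: the rewritten `analyze_super` above resolves the parent class by name against the subclass's MRO instead of looking it up in the global namespace. A minimal standalone sketch of that lookup (hypothetical classes, not MindSpore code):

```
# Illustrative only: find the class named class_name in the instance's MRO
# and build a super() proxy bound to it, as the new analyze_super does.
class Base:
    def hello(self):
        return "Base.hello"

class Child(Base):
    def hello(self):
        return "Child.hello"

def resolve_super(instance, class_name):
    for cls in type(instance).mro():
        if cls.__name__ == class_name:
            return super(cls, instance)
    raise ValueError("the second arg should be an instance of the first arg")

print(resolve_super(Child(), "Child").hello())  # prints "Base.hello"
```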

+3 -1 mindspore/_extends/parse/standard_method.py

@@ -132,7 +132,9 @@ def while_cond(x):
@constexpr
def check_type_same(x_type, base_type):
"""Check x_type is same as base_type."""
return mstype.issubclass_(x_type, base_type)
if mstype.issubclass_(x_type, base_type):
return True
raise TypeError(f"The arg 'x' should be a {base_type}, but got {x_type}.")


@constexpr


+2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pserver_kernel.h

@@ -31,8 +31,9 @@ class PServerKernel {
~PServerKernel() = default;
PServerKernel(const PServerKernel &) = delete;
PServerKernel &operator=(const PServerKernel &) = delete;
virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
virtual void InitKernel(const CNodePtr &cnode,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
virtual void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {}
virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) = 0;


+3 -2 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/pull_kernel.h

@@ -33,8 +33,9 @@ class PullKernel : public CPUKernel {
~PullKernel() override = default;

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &) {
// If the paramter is embedding table, don't Pull from PServer.
if (param_name_.find("embedding") == std::string::npos && param_name_.find("wide_w") == std::string::npos) {
bool init_in_server = mindspore::parallel::ps::Worker<float>::GetInstance().GetParamInitInServer(param_name_);
// If init_in_server, forward kernel should run in server too.
if (!init_in_server) {
parallel::ps::Worker<T>::GetInstance().Pull(key_, inputs[1]->addr, inputs[1]->size);
}
return true;


+4 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h

@@ -43,7 +43,10 @@ class PushKernel : public CPUKernel {
sizes.push_back(SizeToInt(input->size) / sizeof(T));
}
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
auto ret = memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
}
return true;
}



+3 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc

@@ -23,7 +23,7 @@ namespace mindspore {
namespace kernel {
namespace ps {
void SparseApplyAdamPSKernel::InitKernel(
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
std::vector<size_t> &var_shape = *(shape_vec[0]);
std::vector<size_t> &m_shape = *(shape_vec[1]);
@@ -55,11 +55,9 @@ void SparseApplyAdamPSKernel::InitKernel(
if (grad_shape[0] != indices_size_) {
MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices";
}
/*
if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) {
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov");
if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) {
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov");
}
*/
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));


+2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h

@@ -30,7 +30,8 @@ class SparseApplyAdamPSKernel : public SparseApplyAdamCPUKernel, public PServerK
SparseApplyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {}
~SparseApplyAdamPSKernel() override = default;

void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void InitKernel(const CNodePtr &cnode,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;


+17 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc

@@ -20,7 +20,7 @@ namespace mindspore {
namespace kernel {
namespace ps {
void SparseApplyFtrlPSKernel::InitKernel(
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
std::vector<size_t> var_shape = *(shape_vec[0]);
std::vector<size_t> accum_shape = *(shape_vec[1]);
@@ -46,10 +46,22 @@ void SparseApplyFtrlPSKernel::InitKernel(
if (grad_shape[0] != indices_size_) {
MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices";
}
lr_ = 0.01;
l1_ = 1e-8;
l2_ = 1e-8;
lr_power_ = -0.5;
lr_ = AnfAlgo::GetNodeAttr<float>(cnode, "lr");
if (lr_ <= 0) {
MS_LOG(EXCEPTION) << "lr should be a positive scalar";
}
l1_ = AnfAlgo::GetNodeAttr<float>(cnode, "l1");
if (l1_ < 0) {
MS_LOG(EXCEPTION) << "l1 should be a non-negative scalar";
}
l2_ = AnfAlgo::GetNodeAttr<float>(cnode, "l2");
if (l2_ < 0) {
MS_LOG(EXCEPTION) << "l2 should be a non-negative scalar";
}
lr_power_ = AnfAlgo::GetNodeAttr<float>(cnode, "lr_power");
if (lr_power_ > 0) {
MS_LOG(EXCEPTION) << "lr_power should be a non-positive scalar";
}
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));


+2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h

@@ -30,7 +30,8 @@ class SparseApplyFtrlPSKernel : public SparseApplyFtrlCPUKernel, public PServerK
SparseApplyFtrlPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {}
~SparseApplyFtrlPSKernel() override = default;

void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void InitKernel(const CNodePtr &cnode,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;

bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,


+3 -5 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.cc

@@ -23,7 +23,7 @@ namespace mindspore {
namespace kernel {
namespace ps {
void SparseApplyLazyAdamPSKernel::InitKernel(
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
std::vector<size_t> &var_shape = *(shape_vec[0]);
std::vector<size_t> &m_shape = *(shape_vec[1]);
@@ -55,11 +55,9 @@ void SparseApplyLazyAdamPSKernel::InitKernel(
if (grad_shape[0] != indices_size_) {
MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices";
}
/*
if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) {
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov");
if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) {
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov");
}
*/
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));


+2 -1 mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h

@@ -30,7 +30,8 @@ class SparseApplyLazyAdamPSKernel : public SparseApplyLazyAdamCPUKernel, public
SparseApplyLazyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {}
~SparseApplyLazyAdamPSKernel() override = default;

void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void InitKernel(const CNodePtr &cnode,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override;
bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;


+226 -0 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cu

@@ -0,0 +1,226 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <algorithm>
#include "maxpool_with_argmax_grad_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
#include "include/cuda_fp16.h"

template <typename T, typename S>
__global__ void MaxPoolWithArgmaxGrad(const T* x,
const T* dy,
const S* index,
const int n,
const int c,
const int xHeight,
const int xWidth,
const int dyHeight,
const int dyWidth,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int xNCHW,
const int xCHW,
const int xHW,
const int dyCHW,
const int dyHW,
T* dx) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x;
pos < (xNCHW);
pos += blockDim.x * gridDim.x) {
const int posn = pos / xCHW;
const int posc = pos / xHW % c;
const int posh = pos / xHeight % xHeight;
const int posw = pos % xWidth;
const S posIdx = posh*xWidth + posw;
int hstart = posh+padTop;
if (hstart < windowHeight) {
hstart = 0;
} else {
hstart = (hstart-windowHeight)/strideHeight + 1;
}
int wstart = posw+padLeft;
if (wstart < windowWidth) {
wstart = 0;
} else {
wstart = (wstart-windowWidth)/strideWidth + 1;
}
const int hend = min((posh+padTop)/strideHeight +1, dyHeight);
const int wend = min((posw+padLeft)/strideWidth +1, dyWidth);
const int channelStart = posn*dyCHW + posc*dyHW;
T dySum = static_cast<T>(0.0);
for (int hcur = hstart; hcur < hend; ++hcur) {
for (int wcur = wstart; wcur < wend; ++wcur) {
const int curIdx = hcur*dyWidth + wcur;
S maxIdx = index[channelStart+curIdx];
if (maxIdx == posIdx) {
dySum += dy[channelStart+curIdx];
}
}
}
dx[pos] = dySum;
}
return;
}

template <>
__global__ void MaxPoolWithArgmaxGrad(const half* x,
const half* dy,
const int* index,
const int n,
const int c,
const int xHeight,
const int xWidth,
const int dyHeight,
const int dyWidth,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int xNCHW,
const int xCHW,
const int xHW,
const int dyCHW,
const int dyHW,
half* dx) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x;
pos < (xNCHW);
pos += blockDim.x * gridDim.x) {
const int posn = pos / xCHW;
const int posc = pos / xHW % c;
const int posh = pos / xHeight % xHeight;
const int posw = pos % xWidth;
const int posIdx = posh*xWidth + posw;
int hstart = posh+padTop;
if (hstart < windowHeight) {
hstart = 0;
} else {
hstart = (hstart-windowHeight)/strideHeight + 1;
}
int wstart = posw+padLeft;
if (wstart < windowWidth) {
wstart = 0;
} else {
wstart = (wstart-windowWidth)/strideWidth + 1;
}
const int hend = min((posh+padTop)/strideHeight +1, dyHeight);
const int wend = min((posw+padLeft)/strideWidth +1, dyWidth);
const int channelStart = posn*dyCHW + posc*dyHW;
float dySum = 0.0f;
for (int hcur = hstart; hcur < hend; ++hcur) {
for (int wcur = wstart; wcur < wend; ++wcur) {
const int curIdx = hcur*dyWidth + wcur;
int maxIdx = index[channelStart+curIdx];
if (maxIdx == posIdx) {
dySum += __half2float(dy[channelStart+curIdx]);
}
}
}
dx[pos] = __float2half(dySum);
}
return;
}

template <typename T, typename S>
void CalMaxPoolWithArgmaxGrad(const T* x,
const T* dy,
const S* index,
const int n,
const int c,
const int xHeight,
const int xWidth,
const int dyHeight,
const int dyWidth,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
T* dx,
cudaStream_t cuda_stream) {
const int xHW = xHeight*xWidth;
const int xCHW = c*xHW;
const int xNCHW = n*xCHW;
const int dyHW = dyHeight*dyWidth;
const int dyCHW = c*dyHW;
MaxPoolWithArgmaxGrad<<<GET_BLOCKS(xNCHW),
GET_THREADS,
0,
cuda_stream>>>(
x,
dy,
index,
n,
c,
xHeight,
xWidth,
dyHeight,
dyWidth,
windowHeight,
windowWidth,
strideHeight,
strideWidth,
padTop,
padLeft,
xNCHW,
xCHW,
xHW,
dyCHW,
dyHW,
dx);
return;
}

template void CalMaxPoolWithArgmaxGrad<float, int>(const float* x,
const float* dy,
const int* index,
const int n,
const int c,
const int xHeight,
const int xWidth,
const int dyHeight,
const int dyWidth,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
float* dx,
cudaStream_t cuda_stream);
template void CalMaxPoolWithArgmaxGrad<half, int>(const half* x,
const half* dy,
const int* index,
const int n,
const int c,
const int xHeight,
const int xWidth,
const int dyHeight,
const int dyWidth,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
half* dx,
cudaStream_t cuda_stream);
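The non-obvious piece of the backward kernel above is the window inversion: for each input position it visits every output cell whose pooling window covers that position, which is what the hstart/hend and wstart/wend bounds compute, and then accumulates dy wherever the recorded argmax equals posIdx (the dySum loop). A minimal host-side sketch of the same bound arithmetic, with assumed (hypothetical) sizes rather than values taken from this change:

#include <algorithm>
#include <cstdio>

// Host-side restatement of the bounds used in MaxPoolWithArgmaxGrad: output row hcur
// covers input rows [hcur*stride - pad, hcur*stride - pad + window - 1].
void OutputRowRange(int pos, int pad, int window, int stride, int dyLen, int *start, int *end) {
  const int s = pos + pad;
  *start = (s < window) ? 0 : (s - window) / stride + 1;  // first output row whose window still covers pos
  *end = std::min(s / stride + 1, dyLen);                 // one past the last such row
}

int main() {
  int start = 0, end = 0;
  // Assumed configuration: input height 6, window 3, stride 2, pad_top 1, dyHeight 3.
  OutputRowRange(/*pos=*/4, /*pad=*/1, /*window=*/3, /*stride=*/2, /*dyLen=*/3, &start, &end);
  std::printf("output rows [%d, %d)\n", start, end);  // prints [2, 3): only row 2's window covers input row 4
  return 0;
}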

+25 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cuh

@@ -0,0 +1,25 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_
template <typename T, typename S>
void CalMaxPoolWithArgmaxGrad(const T* x, const T* dy, const S* index, const int n, const int c, const int xHeight,
const int xWidth, const int dyHeight, const int dyWidth, const int windowHeight,
const int windowWidth, const int strideHeight, const int strideWidth, const int padTop,
const int padLeft, T* dx, cudaStream_t cuda_stream);

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_

+149 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cu

@@ -0,0 +1,149 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <algorithm>
#include "maxpool_with_argmax_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
#include "include/cuda_fp16.h"
template <typename T, typename S>
__global__ void MaxPoolWithArgmax(const T* input,
const int n,
const int c,
const int h,
const int w,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int outputHeight,
const int outputWidth,
const int outputNCHW,
const int outputCHW,
const int outputHW,
T* output,
S *index) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x;
pos < (outputNCHW);
pos += blockDim.x * gridDim.x) {
const int posn = pos / outputCHW;
const int posc = pos / outputHW % c;
const int posh = pos / outputHeight % outputHeight;
const int posw = pos % outputWidth;
int hstart = posh * strideHeight - padTop;
int wstart = posw * strideWidth - padLeft;
const int hend = min(hstart + windowHeight, h);
const int wend = min(wstart + windowWidth, w);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
S inputStart = posn*c*h*w + posc*h*w;
S maxIdx = hstart*w + wstart;
T maxData = input[inputStart+maxIdx];
for (int hcur = hstart; hcur < hend; ++hcur) {
for (int wcur = wstart; wcur < wend; ++wcur) {
S inputIdx = hcur*w + wcur;
T inputData = input[inputStart+inputIdx];
if (inputData > maxData) {
maxIdx = inputIdx;
maxData = inputData;
}
}
}
output[pos] = maxData;
index[pos] = maxIdx;
}
return;
}

template <typename T, typename S>
void CalMaxPoolWithArgmax(const T* input,
const int n,
const int c,
const int h,
const int w,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int outputHeight,
const int outputWidth,
T* output,
S *index,
cudaStream_t cuda_stream) {
const int outputNCHW = n*c*outputHeight*outputWidth;
const int outputCHW = c*outputHeight*outputWidth;
const int outputHW = outputHeight*outputWidth;
MaxPoolWithArgmax<<<GET_BLOCKS(n*c*outputHeight*outputWidth),
GET_THREADS,
0,
cuda_stream>>>(
input,
n,
c,
h,
w,
windowHeight,
windowWidth,
strideHeight,
strideWidth,
padTop,
padLeft,
outputHeight,
outputWidth,
outputNCHW,
outputCHW,
outputHW,
output,
index);
return;
}

template void CalMaxPoolWithArgmax<float, int>(const float* input,
const int n,
const int c,
const int h,
const int w,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int outputHeight,
const int outputWidth,
float* output,
int* index,
cudaStream_t cuda_stream);

template void CalMaxPoolWithArgmax<half, int>(const half* input,
const int n,
const int c,
const int h,
const int w,
const int windowHeight,
const int windowWidth,
const int strideHeight,
const int strideWidth,
const int padTop,
const int padLeft,
const int outputHeight,
const int outputWidth,
half* output,
int* index,
cudaStream_t cuda_stream);
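Note that the index written by the forward kernel is hcur*w + wcur, i.e. a flattened offset inside one H×W plane rather than a global NCHW offset; the gradient kernel earlier in this diff compares the stored index against posIdx = posh*xWidth + posw, which matches this convention. A small CPU reference for a single plane, with an assumed 4×4 input, 2×2 window and stride 2 (a sketch only, not taken from the tests):

#include <algorithm>
#include <cstdio>
#include <vector>

// CPU restatement of the kernel's inner loop for one (n, c) plane; the stored
// index is hcur*w + wcur, an offset within that plane only (no padding here).
void MaxPoolPlane(const std::vector<float> &in, int h, int w, int window, int stride,
                  int outH, int outW, std::vector<float> *out, std::vector<int> *idx) {
  out->assign(outH * outW, 0.0f);
  idx->assign(outH * outW, 0);
  for (int oh = 0; oh < outH; ++oh) {
    for (int ow = 0; ow < outW; ++ow) {
      const int hstart = oh * stride, wstart = ow * stride;
      const int hend = std::min(hstart + window, h), wend = std::min(wstart + window, w);
      int best = hstart * w + wstart;
      for (int hc = hstart; hc < hend; ++hc)
        for (int wc = wstart; wc < wend; ++wc)
          if (in[hc * w + wc] > in[best]) best = hc * w + wc;
      (*out)[oh * outW + ow] = in[best];
      (*idx)[oh * outW + ow] = best;
    }
  }
}

int main() {
  // Assumed 4x4 plane, 2x2 window, stride 2.
  std::vector<float> in = {1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16};
  std::vector<float> out;
  std::vector<int> idx;
  MaxPoolPlane(in, 4, 4, 2, 2, 2, 2, &out, &idx);
  for (int i = 0; i < 4; ++i) std::printf("out=%g idx=%d\n", out[i], idx[i]);
  return 0;
}

For this input the four windows produce out = {4, 8, 12, 16} with idx = {5, 7, 13, 15}.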

+25 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cuh

@@ -0,0 +1,25 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_
template <typename T, typename S>
void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, const int windowHeight,
const int windowWidth, const int strideHeight, const int strideWidth, const int padTop,
const int padLeft, const int outputHeight, const int outputWidth, T* output, S *index,
cudaStream_t cuda_stream);

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_

+30 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.cc

@@ -0,0 +1,30 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
MaxPoolWithArgmax,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
MaxPoolWithArgmaxGpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
MaxPoolWithArgmax,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32),
MaxPoolWithArgmaxGpuFwdKernel, half, int)
} // namespace kernel
} // namespace mindspore

+160 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.h

@@ -0,0 +1,160 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_

#include <algorithm>
#include <vector>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cuh"
#include "backend/kernel_compiler/gpu/kernel_constants.h"

namespace mindspore {
namespace kernel {
template <typename T, typename S>
class MaxPoolWithArgmaxGpuFwdKernel : public GpuKernel {
public:
MaxPoolWithArgmaxGpuFwdKernel()
: n_(0),
c_(0),
input_height_(0),
input_width_(0),
window_height_(0),
window_width_(0),
pad_height_(0),
pad_width_(0),
pad_top_(0),
pad_left_(0),
stride_height_(0),
stride_width_(0),
output_height_(0),
output_width_(0),
input_size_(0),
output_size_(0) {}
~MaxPoolWithArgmaxGpuFwdKernel() override = default;

const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *index_addr = GetDeviceAddress<S>(outputs, 1);
CalMaxPoolWithArgmax(input_addr, n_, c_, input_height_, input_width_, window_height_, window_width_, stride_height_,
stride_width_, pad_top_, pad_left_, output_height_, output_width_, output_addr, index_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}

bool Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolWithArgmax needs 1 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolWithArgmax needs 2 output.";
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
input_size_ = sizeof(T);
for (auto x : input_shape) {
input_size_ *= x;
}
output_size_ = sizeof(T);
for (auto x : output_shape) {
output_size_ *= x;
}
n_ = SizeToInt(input_shape[0]);
c_ = SizeToInt(input_shape[1]);
input_height_ = SizeToInt(input_shape[2]);
input_width_ = SizeToInt(input_shape[3]);
output_height_ = SizeToInt(output_shape[2]);
output_width_ = SizeToInt(output_shape[3]);
auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize"));
window_height_ = window[1];
window_width_ = window[2];
auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides"));
stride_height_ = stride[1];
stride_width_ = stride[2];
pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding"));
pad_top_ = 0;
pad_left_ = 0;
if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) {
SetPad();
}
InitSizeLists();
return true;
}

protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
output_size_list_.push_back(output_size_ / sizeof(T) * sizeof(S));
}

private:
void SetPad() {
pad_height_ = std::max<int>(
0, (((input_height_ / stride_height_) * stride_height_ == input_height_ ? (input_height_ / stride_height_)
: (input_height_ / stride_height_) + 1) -
1) *
stride_height_ +
window_height_ - input_height_);
pad_width_ = std::max<int>(
0, (((input_width_ / stride_width_) * stride_width_ == input_width_ ? (input_width_ / stride_width_)
: (input_width_ / stride_width_) + 1) -
1) *
stride_width_ +
window_width_ - input_width_);
pad_top_ = pad_height_ / 2;
pad_left_ = pad_width_ / 2;
}

std::string pad_mode_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;

int n_;
int c_;
int input_height_;
int input_width_;
int window_height_;
int window_width_;
int pad_height_;
int pad_width_;
int pad_top_;
int pad_left_;
int stride_height_;
int stride_width_;
int output_height_;
int output_width_;

size_t input_size_;
size_t output_size_;
};
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_
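SetPad() above is the usual SAME-padding formula written out with an integer ceiling: pad = max(0, (ceil(in/stride) - 1)*stride + window - in), with the top/left side taking pad/2. A short sketch of the same arithmetic with assumed numbers:

#include <algorithm>
#include <cstdio>

// Same arithmetic as MaxPoolWithArgmaxGpuFwdKernel::SetPad(), with the ceiling written directly.
int SameModePad(int in, int stride, int window) {
  const int out = (in + stride - 1) / stride;            // ceil(in / stride)
  return std::max(0, (out - 1) * stride + window - in);  // total padding along this axis
}

int main() {
  // Assumed example: input extent 5, stride 2, window 3 -> output extent 3.
  const int pad = SameModePad(5, 2, 3);
  std::printf("pad_total=%d pad_top=%d\n", pad, pad / 2);  // pad_total=2, pad_top=1
  return 0;
}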

+36 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.cc

@@ -0,0 +1,36 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32),
MaxPoolWithArgmaxGradGpuKernel, float, int)
MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat16),
MaxPoolWithArgmaxGradGpuKernel, half, int)
} // namespace kernel
} // namespace mindspore

+168 -0  mindspore/ccsrc/backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.h

@@ -0,0 +1,168 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_

#include <algorithm>
#include <vector>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cuh"
#include "backend/kernel_compiler/gpu/kernel_constants.h"

namespace mindspore {
namespace kernel {
template <typename T, typename S>
class MaxPoolWithArgmaxGradGpuKernel : public GpuKernel {
public:
MaxPoolWithArgmaxGradGpuKernel()
: n_(0),
c_(0),
x_height_(0),
x_width_(0),
dy_height_(0),
dy_width_(0),
x_size_(0),
dy_size_(0),
index_size_(0),
dx_size_(0) {}
~MaxPoolWithArgmaxGradGpuKernel() override = default;

const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
T *x_addr = GetDeviceAddress<T>(inputs, 0);
T *dy_addr = GetDeviceAddress<T>(inputs, 1);
S *index_addr = GetDeviceAddress<S>(inputs, 2);
T *dx_addr = GetDeviceAddress<T>(outputs, 0);
CalMaxPoolWithArgmaxGrad(x_addr, dy_addr, index_addr, n_, c_, x_height_, x_width_, dy_height_, dy_width_,
window_height_, window_width_, stride_height_, stride_width_, pad_top_, pad_left_, dx_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}

bool Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolGradWithArgmax needs 3 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolGradWithArgmax needs 1 output.";
return false;
}
auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
auto index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
auto dx_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
x_size_ = sizeof(T);
for (auto x : x_shape) {
x_size_ *= x;
}
dy_size_ = sizeof(T);
for (auto x : dy_shape) {
dy_size_ *= x;
}
index_size_ = sizeof(S);
for (auto x : index_shape) {
index_size_ *= x;
}
dx_size_ = sizeof(T);
for (auto x : dx_shape) {
dx_size_ *= x;
}
n_ = SizeToInt(x_shape[0]);
c_ = SizeToInt(x_shape[1]);
x_height_ = SizeToInt(x_shape[2]);
x_width_ = SizeToInt(x_shape[3]);
dy_height_ = SizeToInt(dy_shape[2]);
dy_width_ = SizeToInt(dy_shape[3]);
auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize"));
window_height_ = window[1];
window_width_ = window[2];
auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides"));
stride_height_ = stride[1];
stride_width_ = stride[2];
pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding"));
pad_top_ = 0;
pad_left_ = 0;
if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) {
SetPad();
}
InitSizeLists();
return true;
}

protected:
void InitSizeLists() override {
input_size_list_.push_back(x_size_);
input_size_list_.push_back(dy_size_);
input_size_list_.push_back(index_size_);
output_size_list_.push_back(dx_size_);
}

private:
void SetPad() {
pad_height_ = std::max<int>(
0, (((x_height_ / stride_height_) * stride_height_ == x_height_ ? (x_height_ / stride_height_)
: (x_height_ / stride_height_) + 1) -
1) *
stride_height_ +
window_height_ - x_height_);
pad_width_ =
std::max<int>(0, (((x_width_ / stride_width_) * stride_width_ == x_width_ ? (x_width_ / stride_width_)
: (x_width_ / stride_width_) + 1) -
1) *
stride_width_ +
window_width_ - x_width_);
pad_top_ = pad_height_ / 2;
pad_left_ = pad_width_ / 2;
}

std::string pad_mode_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;

int n_;
int c_;
int x_height_;
int x_width_;
int dy_height_;
int dy_width_;
int window_height_;
int window_width_;
int pad_height_;
int pad_width_;
int pad_top_;
int pad_left_;
int stride_height_;
int stride_width_;

size_t x_size_;
size_t dy_size_;
size_t index_size_;
size_t dx_size_;
};
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_

+6 -0  mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel_metadata.cc

@@ -20,11 +20,17 @@
#include "utils/utils.h"
#include "backend/kernel_compiler/hccl/hcom_util.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "frontend/parallel/context.h"

namespace mindspore {
namespace kernel {
namespace {
std::string GetKernelFormat(const CNodePtr &kernel_node, size_t index) {
auto parallel_context_instance = parallel::ParallelContext::GetInstance();
MS_EXCEPTION_IF_NULL(parallel_context_instance);
if (parallel_context_instance->enable_parallel_optimizer()) {
return kOpFormat_DEFAULT;
}
const std::set<std::string> kReduceNoSupportedSet = {kOpFormat_FRAC_Z, kOpFormat_FRACTAL_Z_C04, kOpFormat_C1HWNCoC0};
auto op_name = AnfAlgo::GetCNodeName(kernel_node);
auto format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, index);


+40 -2  mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc

@@ -65,6 +65,9 @@ size_t KernelBuildInfo::GetInputNum() const { return inputs_format_.size(); }
size_t KernelBuildInfo::GetOutputNum() const { return outputs_format_.size(); }

std::vector<Axis> KernelBuildInfo::GetInputReshapeType(size_t input_index) const {
if (input_reshape_type_.empty()) {
return {};
}
if (input_index >= input_reshape_type_.size()) {
MS_LOG(EXCEPTION) << "The index [" << input_index << "] is exceed the number of input node size "
<< input_reshape_type_.size();
@@ -73,6 +76,9 @@ std::vector<Axis> KernelBuildInfo::GetInputReshapeType(size_t input_index) const
}

std::vector<Axis> KernelBuildInfo::GetOutputReshapeType(size_t output_index) const {
if (output_reshape_type_.empty()) {
return {};
}
if (output_index >= output_reshape_type_.size()) {
MS_LOG(EXCEPTION) << "The index [" << output_index << "] is exceed the number of output node size "
<< output_reshape_type_.size();
@@ -158,13 +164,13 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetProcessor(Processor processor)

std::shared_ptr<KernelBuildInfo> KernelBuildInfo::KernelBuildInfoBuilder::Build() { return kernel_build_info_; }

void KernelBuildInfo::KernelBuildInfoBuilder::SetInputReshapeType(
void KernelBuildInfo::KernelBuildInfoBuilder::SetInputsReshapeType(
const std::vector<std::vector<Axis>> &input_reshape_type) {
MS_EXCEPTION_IF_NULL(kernel_build_info_);
kernel_build_info_->input_reshape_type_ = input_reshape_type;
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputReshapeType(
void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputsReshapeType(
const std::vector<std::vector<Axis>> &output_reshape_type) {
MS_EXCEPTION_IF_NULL(kernel_build_info_);
kernel_build_info_->output_reshape_type_ = output_reshape_type;
@@ -189,5 +195,37 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputFormat(const std::string
}
kernel_build_info_->outputs_format_[index] = format;
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetInputReshapeType(const std::vector<Axis> &input_reshape_type,
size_t index) {
if (index >= kernel_build_info_->input_reshape_type_.size()) {
MS_LOG(EXCEPTION) << "index outof range!";
}
std::copy(input_reshape_type.begin(), input_reshape_type.end(),
std::back_inserter(kernel_build_info_->input_reshape_type_[index]));
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputReshapeType(const std::vector<Axis> &output_reshape_type,
size_t index) {
if (index >= kernel_build_info_->output_reshape_type_.size()) {
MS_LOG(EXCEPTION) << "index outof range!";
}
std::copy(output_reshape_type.begin(), output_reshape_type.end(),
std::back_inserter(kernel_build_info_->output_reshape_type_[index]));
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputDeviceType(const TypeId &output_device_type, size_t index) {
if (index >= kernel_build_info_->outputs_device_type_.size()) {
MS_LOG(EXCEPTION) << "index outof range!";
}
kernel_build_info_->outputs_device_type_[index] = output_device_type;
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetInputDeviceType(const TypeId &input_device_type, size_t index) {
if (index >= kernel_build_info_->inputs_device_type_.size()) {
MS_LOG(EXCEPTION) << "index outof range!";
}
kernel_build_info_->inputs_device_type_[index] = input_device_type;
}
} // namespace kernel
} // namespace mindspore
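For context on the renamed whole-list setters and the new per-index setters, a hedged fragment of the intended call pattern (not a standalone program; the formats and device types below are placeholders chosen from constants that appear elsewhere in this diff):

kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
// Whole-list setters: the vector overloads were renamed to the plural forms so they
// no longer collide with the per-index overloads added in this change.
builder.SetInputsFormat({kOpFormat_DEFAULT});
builder.SetInputsDeviceType({kNumberTypeFloat32});
builder.SetInputsReshapeType({{}});
builder.SetOutputsFormat({kOpFormat_DEFAULT});
builder.SetOutputsDeviceType({kNumberTypeFloat32});
builder.SetOutputsReshapeType({{}});
// Per-index setters: they index into the existing vectors, so those must be sized already.
builder.SetOutputFormat(kOpFormat_FRAC_Z, 0);
builder.SetOutputDeviceType(kNumberTypeFloat16, 0);
auto build_info = builder.Build();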

+31 -4  mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h

@@ -71,6 +71,10 @@ class KernelBuildInfo {

std::vector<TypeId> GetAllOutputDeviceTypes() const;

std::vector<std::vector<Axis>> GetAllOutputReshapeType() const;

std::vector<std::vector<Axis>> GetAllInputReshapeType() const;

OpPattern op_pattern() const { return op_pattern_; }

FusionType fusion_type() const { return fusion_type_; }
@@ -108,8 +112,23 @@ class KernelBuildInfo::KernelBuildInfoBuilder {
public:
KernelBuildInfoBuilder() { kernel_build_info_ = std::make_shared<KernelBuildInfo>(); }

explicit KernelBuildInfoBuilder(std::shared_ptr<KernelBuildInfo> kernel_build_info)
: kernel_build_info_(std::move(kernel_build_info)) {}
explicit KernelBuildInfoBuilder(const std::shared_ptr<KernelBuildInfo> &kernel_build_info)
: kernel_build_info_(std::make_shared<KernelBuildInfo>()) {
SetKernelType(kernel_build_info->kernel_type());
SetFusionType(kernel_build_info->fusion_type());
SetProcessor(kernel_build_info->processor());
OpPattern(kernel_build_info->op_pattern());
for (size_t index = 0; index < kernel_build_info->GetInputNum(); ++index) {
kernel_build_info_->inputs_device_type_.emplace_back(kernel_build_info->GetInputDeviceType(index));
kernel_build_info_->inputs_format_.emplace_back(kernel_build_info->GetInputFormat(index));
kernel_build_info_->input_reshape_type_.emplace_back(kernel_build_info->GetInputReshapeType(index));
}
for (size_t index = 0; index < kernel_build_info->GetOutputNum(); ++index) {
kernel_build_info_->outputs_device_type_.emplace_back(kernel_build_info->GetOutputDeviceType(index));
kernel_build_info_->outputs_format_.emplace_back(kernel_build_info->GetOutputFormat(index));
kernel_build_info_->output_reshape_type_.emplace_back(kernel_build_info->GetOutputReshapeType(index));
}
}

~KernelBuildInfoBuilder() = default;

@@ -123,9 +142,9 @@ class KernelBuildInfo::KernelBuildInfoBuilder {

void SetOutputsDeviceType(const std::vector<TypeId> &outputs_device_type);

void SetInputReshapeType(const std::vector<std::vector<Axis>> &input_reshape_type);
void SetInputsReshapeType(const std::vector<std::vector<Axis>> &input_reshape_type);

void SetOutputReshapeType(const std::vector<std::vector<Axis>> &output_reshape_type);
void SetOutputsReshapeType(const std::vector<std::vector<Axis>> &output_reshape_type);

void SetFusionType(FusionType fusion_type);

@@ -137,6 +156,14 @@ class KernelBuildInfo::KernelBuildInfoBuilder {

void SetOutputFormat(const std::string &format, size_t index);

void SetInputReshapeType(const std::vector<Axis> &input_reshape_type, size_t index);

void SetOutputReshapeType(const std::vector<Axis> &output_reshape_type, size_t index);

void SetInputDeviceType(const TypeId &input_device_type, size_t index);

void SetOutputDeviceType(const TypeId &output_device_type, size_t index);

std::shared_ptr<KernelBuildInfo> Build();

private:


+2 -2  mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc

@@ -118,7 +118,7 @@ void TbeKernelSelect::GetCommonPatternKernelInfo(const OpInfo &op_info) {
}
builder.SetInputsDeviceType(inputs_device_type);
builder.SetInputsFormat(inputs_format);
builder.SetInputReshapeType(inputs_reshape_type);
builder.SetInputsReshapeType(inputs_reshape_type);
// output
std::vector<std::string> outputs_format;
std::vector<TypeId> outputs_device_type;
@@ -129,7 +129,7 @@ void TbeKernelSelect::GetCommonPatternKernelInfo(const OpInfo &op_info) {
}
builder.SetOutputsDeviceType(outputs_device_type);
builder.SetOutputsFormat(outputs_format);
builder.SetOutputReshapeType(outputs_reshape_type);
builder.SetOutputsReshapeType(outputs_reshape_type);
kernel_info_list_->emplace_back(builder.Build());
}
MS_LOG(INFO) << "end.";


+2 -0  mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc

@@ -47,6 +47,7 @@
#include "backend/optimizer/ascend/ir_fission/transdata_split.h"
#include "backend/optimizer/ascend/ir_fission/topk_split.h"
#include "backend/optimizer/ascend/ir_fusion/momentum_lossscale_fusion.h"
#include "backend/optimizer/ascend/format_type/split_unsupported_transdata.h"
#include "backend/optimizer/ascend/ir_fusion/mul_add_fusion.h"
#include "backend/optimizer/ascend/ir_fusion/mul_addn_fusion.h"
#include "backend/optimizer/ascend/ir_fusion/matmul_biasadd_fusion.h"
@@ -228,6 +229,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>());
mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
mixed_precision_pm->AddPass(std::make_shared<SplitUnsupportedTransData>());
mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
optimizer->AddPassManager(mixed_precision_pm);


+4 -4  mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc

@@ -153,7 +153,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) {
auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node, output_idx)) {
kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0);
}
make_tuple_inputs.emplace_back(trans_op);
@@ -174,8 +174,8 @@ void RefreshKernelBuildInfo(const std::string &input_format, const std::string &
MS_EXCEPTION_IF_NULL(ori_build_info);
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(ori_build_info);
builder->SetInputsFormat({input_format});
builder->SetInputReshapeType({reshape_type});
builder->SetOutputReshapeType({reshape_type});
builder->SetInputsReshapeType({reshape_type});
builder->SetOutputsReshapeType({reshape_type});
builder->SetOutputsFormat({output_format});
if (type_id != kTypeUnknown) {
builder->SetOutputsDeviceType({type_id});
@@ -265,7 +265,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
// Single output
if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) {
auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node, 0)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
return new_node;


+39 -20  mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc

@@ -40,6 +40,38 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) {
return real_node->isa<ValueNode>();
}

void SetInput(const CNodePtr &control_depend, const int index, const FuncGraphPtr &graph, const CNodePtr &hccl_node,
const std::vector<AnfNodePtr> &memcpy_async_list) {
MS_EXCEPTION_IF_NULL(control_depend);
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(hccl_node);
std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple)};
make_tuple_inputs.insert(make_tuple_inputs.end(), memcpy_async_list.begin(), memcpy_async_list.end());
make_tuple_inputs.emplace_back(hccl_node);
auto make_tuple = graph->NewCNode(make_tuple_inputs);
MS_EXCEPTION_IF_NULL(make_tuple);
control_depend->set_input(IntToSize(index), make_tuple);
}

void DealControlForGetitem(const CNodePtr &tuple_getitem, const FuncGraphPtr &graph, const CNodePtr &hccl_node,
const std::vector<AnfNodePtr> &memcpy_async_list) {
MS_EXCEPTION_IF_NULL(tuple_getitem);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
auto &node_users = manager->node_users();
auto iter = node_users.find(tuple_getitem);
if (iter == node_users.end()) {
MS_LOG(EXCEPTION) << "node has no output in manager";
}
for (const auto &node_index : iter->second) {
AnfNodePtr output = node_index.first;
MS_EXCEPTION_IF_NULL(output);
if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimControlDepend)) {
SetInput(output->cast<CNodePtr>(), node_index.second, graph, hccl_node, memcpy_async_list);
}
}
}

void TransferControl(const CNodePtr &hccl_node, const std::vector<AnfNodePtr> &memcpy_async_list,
const FuncGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(hccl_node);
@@ -53,25 +85,13 @@ void TransferControl(const CNodePtr &hccl_node, const std::vector<AnfNodePtr> &m
}
// find hccl_node's output which is a control depend
for (const auto &node_index : iter->second) {
if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimControlDepend)) {
continue;
}
CNodePtr control_depend = node_index.first->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(control_depend);
std::vector<AnfNodePtr> new_inputs;
for (size_t i = 0; i < control_depend->size(); ++i) {
if (i == IntToSize(node_index.second)) {
std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple)};
make_tuple_inputs.insert(make_tuple_inputs.end(), memcpy_async_list.begin(), memcpy_async_list.end());
make_tuple_inputs.emplace_back(hccl_node);
auto make_tuple = graph->NewCNode(make_tuple_inputs);
MS_EXCEPTION_IF_NULL(make_tuple);
new_inputs.push_back(make_tuple);
} else {
new_inputs.push_back(control_depend->input(i));
}
AnfNodePtr output = node_index.first;
MS_EXCEPTION_IF_NULL(output);
if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimControlDepend)) {
SetInput(output->cast<CNodePtr>(), node_index.second, graph, hccl_node, memcpy_async_list);
} else if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimTupleGetItem)) {
DealControlForGetitem(output->cast<CNodePtr>(), graph, hccl_node, memcpy_async_list);
}
control_depend->set_inputs(new_inputs);
}
}
} // namespace
@@ -148,11 +168,10 @@ const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_gr
if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
return nullptr;
}
auto cnode = node->cast<CNodePtr>();
if (!AnfAlgo::IsCommunicationOp(node)) {
return nullptr;
}
InsertMemcpyAsync(func_graph, cnode);
InsertMemcpyAsync(func_graph, node->cast<CNodePtr>());
return nullptr;
}
} // namespace opt


+2 -2  mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc

@@ -65,7 +65,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode, output_idx)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0);
}
} else {
@@ -114,7 +114,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode, 0)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node);
}
}


+1 -1  mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc

@@ -58,7 +58,7 @@ const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, c
if (kernel_graph == nullptr) {
return nullptr;
}
if (!kernel_graph->IsInternalOutput(node)) {
if (!kernel_graph->IsInternalOutput(node, 0)) {
return nullptr;
}
if (!UsedForOutputOnly(func_graph, node)) {


+65 -0  mindspore/ccsrc/backend/optimizer/ascend/format_type/split_unsupported_transdata.cc

@@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/optimizer/ascend/format_type/split_unsupported_transdata.h"
#include <vector>
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"

namespace mindspore {
namespace opt {
const BaseRef SplitUnsupportedTransData::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
return VectorRef({prim::KPrimTransData, X});
}

const AnfNodePtr SplitUnsupportedTransData::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (node == nullptr || !node->isa<CNode>() || !AnfAlgo::IsRealKernel(node)) {
return nullptr;
}
auto ori_trans_data = node->cast<CNodePtr>();
if (AnfAlgo::GetCNodeName(ori_trans_data) != prim::KPrimTransData->name()) {
return nullptr;
}
auto kernel_info = AnfAlgo::GetSelectKernelBuildInfo(ori_trans_data);
MS_EXCEPTION_IF_NULL(kernel_info);
if (kernel_info->GetInputNum() != 1 || kernel_info->GetOutputNum() != 1) {
MS_LOG(EXCEPTION) << "Transdata node's kernel info's input and output format size is not 1"
<< ori_trans_data->DebugString();
}
return SplitTransData(func_graph, ori_trans_data);
}
AnfNodePtr SplitUnsupportedTransData::SplitTransData(const FuncGraphPtr &func_graph, const CNodePtr &trans_node) const {
auto kernel_info = AnfAlgo::GetSelectKernelBuildInfo(trans_node);
if (kHWSpecialFormatSet.find(kernel_info->GetInputFormat(0)) == kHWSpecialFormatSet.end() ||
kHWSpecialFormatSet.find(kernel_info->GetOutputFormat(0)) == kHWSpecialFormatSet.end()) {
return trans_node;
}
auto builder_info_to_default = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(kernel_info);
auto builder_info_to_special_foramt = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(kernel_info);
builder_info_to_default->SetOutputsFormat({kOpFormat_DEFAULT});
builder_info_to_special_foramt->SetInputsFormat({kOpFormat_DEFAULT});
std::vector<AnfNodePtr> next_trans_node_inputs = {
NewValueNode(std::make_shared<Primitive>(prim::KPrimTransData->name())), trans_node};
auto next_trans_node = func_graph->NewCNode(next_trans_node_inputs);
next_trans_node->set_abstract(trans_node->abstract());
AnfAlgo::SetSelectKernelBuildInfo(builder_info_to_default->Build(), trans_node.get());
AnfAlgo::SetSelectKernelBuildInfo(builder_info_to_special_foramt->Build(), next_trans_node.get());
return next_trans_node;
}
} // namespace opt
} // namespace mindspore
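In effect the pass rewrites a TransData whose input and output formats are both hardware-special into two hops through the default format; a hedged before/after sketch, assuming FRAC_Z and C1HWNCoC0 are both in kHWSpecialFormatSet:

before:  x [FRAC_Z]  -> TransData -> y [C1HWNCoC0]
after:   x [FRAC_Z]  -> TransData -> [DEFAULT] -> TransData -> y [C1HWNCoC0]

The original node keeps its special input format but is rebuilt with a DEFAULT output, while the newly inserted node converts DEFAULT into the original special output format.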

+37 -0  mindspore/ccsrc/backend/optimizer/ascend/format_type/split_unsupported_transdata.h

@@ -0,0 +1,37 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H

#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
class SplitUnsupportedTransData : public PatternProcessPass {
public:
explicit SplitUnsupportedTransData(bool multigraph = true)
: PatternProcessPass("split_unsupported_transdata", multigraph) {}
~SplitUnsupportedTransData() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

private:
AnfNodePtr SplitTransData(const FuncGraphPtr &func_graph, const CNodePtr &trans_node) const;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H

+1 -1  mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc

@@ -405,7 +405,7 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod
}
auto node = cnode->input(input_idx + 1);
MS_EXCEPTION_IF_NULL(node);
return VisitKernel(node, 0);
return VisitKernelWithReturnType(node, 0);
}

std::string AnfRuntimeAlgorithm::GetPrevNodeOutputFormat(const AnfNodePtr &anf_node, size_t input_idx) {


+40 -19  mindspore/ccsrc/backend/session/ascend_inference_session.cc

@@ -94,25 +94,33 @@ bool AscendInferenceSession::CheckModelInputs(uint32_t graph_id, const std::vect
MS_EXCEPTION_IF_NULL(kernel_graph);
auto kernel_graph_inputs = kernel_graph->inputs();
size_t no_weight_input = 0;
vector<ParameterPtr> paras;
// find parameters of graph inputs
for (size_t i = 0; i < kernel_graph_inputs.size(); ++i) {
tensor::TensorPtr tensor = nullptr;
if (!kernel_graph_inputs[i]->isa<Parameter>()) {
MS_LOG(ERROR) << "Kernel graph inputs have anfnode which is not Parameter.";
continue;
}
auto parameter = kernel_graph_inputs[i]->cast<ParameterPtr>();
if (!AnfAlgo::IsParameterWeight(parameter)) {
// compare input number
if (no_weight_input >= inputs.size()) {
MS_LOG(ERROR) << "Input number is inconsistent. The actual input number [" << inputs.size()
<< "] less than that of graph.";
return false;
}
auto input = inputs[no_weight_input++];
if (!CompareInput(input, parameter)) {
MS_LOG(ERROR) << "Please check the input information.";
return false;
}
paras.push_back(parameter);
}
}

// check inputs
for (size_t i = 0; i < paras.size(); ++i) {
// compare input number
if (paras.size() != inputs.size()) {
MS_LOG(ERROR) << "Input number is inconsistent. The actual input number [" << inputs.size()
<< "] but the graph input number is [" << paras.size() << "]";
MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs);
return false;
}
auto input = inputs[no_weight_input++];
if (!CompareInput(input, paras[i])) {
MS_LOG(ERROR) << "Please check the input information.";
MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs);
return false;
}
}
return true;
@@ -123,12 +131,6 @@ bool AscendInferenceSession::CompareInput(const tensor::TensorPtr &input, const
MS_EXCEPTION_IF_NULL(parameter);
// compare dims
auto parameter_shape = AnfAlgo::GetOutputDeviceShape(parameter, 0);
if (input->shape().size() != parameter_shape.size()) {
MS_LOG(ERROR) << "Input dim is inconsistent. The actual dim is " << input->shape().size()
<< ", but the parameter dim is " << parameter_shape.size()
<< ". parameter : " << parameter->DebugString();
return false;
}

// compare shape
auto input_shape = input->shape();
@@ -153,12 +155,31 @@ bool AscendInferenceSession::CompareInput(const tensor::TensorPtr &input, const
return true;
}

std::string AscendInferenceSession::PrintInputShape(std::vector<size_t> shape) const {
template <typename T>
std::string AscendInferenceSession::PrintInputShape(std::vector<T> shape) const {
string res = "[";
for (auto dim : shape) {
res += " " + std::to_string(dim);
}
return res + " ]";
}

std::string AscendInferenceSession::InputsInfo(const std::vector<ParameterPtr> &paras,
const std::vector<tensor::TensorPtr> &inputs) const {
std::string graph = "graph inputs:{ ";
for (size_t i = 0; i < paras.size(); ++i) {
graph += std::to_string(i) + ": dims " + std::to_string(AnfAlgo::GetOutputDeviceShape(paras[i], 0).size()) +
", shape " + PrintInputShape(AnfAlgo::GetOutputDeviceShape(paras[i], 0)) + ", data type " +
std::to_string(AnfAlgo::GetSelectKernelBuildInfo(paras[i])->GetOutputDeviceType(0)) + " }";
}

std::string actual = "actual inputs:{ ";
for (size_t i = 0; i < inputs.size(); ++i) {
actual += std::to_string(i) + ": dims " + std::to_string(inputs[i]->shape().size()) + ", shape " +
PrintInputShape(inputs[i]->shape()) + ", data type " + std::to_string(inputs[i]->data_type()) + " }";
}
return graph + " " + actual;
}

} // namespace session
} // namespace mindspore
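For illustration, with one assumed 4-D float input the message assembled by InputsInfo would look roughly like the line below (the data type field is the raw TypeId printed as an integer, so the exact number depends on the enum value):

graph inputs:{ 0: dims 4, shape [ 1 3 224 224 ], data type <TypeId> } actual inputs:{ 0: dims 4, shape [ 1 3 224 224 ], data type <TypeId> }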

+3 -1  mindspore/ccsrc/backend/session/ascend_inference_session.h

@@ -41,7 +41,9 @@ class AscendInferenceSession : public AscendSession {
GraphId CompileGraph(NotNull<FuncGraphPtr> func_graph) override;
bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs) const override;
bool CompareInput(const tensor::TensorPtr &input, const ParameterPtr &parameter) const;
std::string PrintInputShape(std::vector<size_t> shape) const;
template <typename T>
std::string PrintInputShape(std::vector<T> shape) const;
std::string InputsInfo(const std::vector<ParameterPtr> &paras, const std::vector<tensor::TensorPtr> &inputs) const;
};
MS_REG_SESSION(kDavinciInferenceDevice, AscendInferenceSession);
} // namespace session


+1 -3  mindspore/ccsrc/backend/session/ascend_session.cc

@@ -517,9 +517,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
LoadInputData(kernel_graph, inputs);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
InitPSParamAndOptim(kernel_graph, inputs);
#endif
// convert inputs to model
predictmodel::StepConvertWeight(inputs);


+1 -4  mindspore/ccsrc/backend/session/cpu_session.cc

@@ -91,10 +91,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
InitPSParamAndOptim(kernel_graph, inputs);
#endif
MS_LOG(INFO) << "Bind input output address";
std::vector<tensor::TensorPtr> need_sync_outputs;


+5 -4  mindspore/ccsrc/backend/session/gpu_session.cc

@@ -233,9 +233,7 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
LoadInputData(kernel_graph, inputs);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
InitPSParamAndOptim(kernel_graph, inputs);
#endif
MS_EXCEPTION_IF_NULL(kernel_graph);
// Convert inputs to model
@@ -281,7 +279,10 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
RunOpAllocateMemory(input_tensors, kernel_graph.get());
// Execute the computation
LoadInputData(kernel_graph, input_tensors);
Execute(kernel_graph);
{
py::gil_scoped_release gil_release;
Execute(kernel_graph);
}
// Fetch outputs
VectorRef outputs;
UpdateOutputs(kernel_graph, &outputs, input_tensors);


+10 -20  mindspore/ccsrc/backend/session/kernel_graph.cc

@@ -1021,26 +1021,16 @@ AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_nod
return nullptr;
}

bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const {
if (internal_outputs_to_front_map_.find(node) != internal_outputs_to_front_map_.end()) {
return true;
}
return false;
}

void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) {
if (node == nullptr) {
return;
}
(void)final_output_kernels_.insert(node);
}

bool KernelGraph::IsFinalOutputKernel(const AnfNodePtr &node) const {
if (node == nullptr) {
return false;
}
if (final_output_kernels_.find(node) != final_output_kernels_.end()) {
return true;
bool KernelGraph::IsInternalOutput(const AnfNodePtr &node, int output_idx) const {
auto front_nodes_iter = internal_outputs_to_front_map_.find(node);
if (front_nodes_iter != internal_outputs_to_front_map_.end()) {
if (output_idx == -1) {
return true;
}
auto &front_nodes = front_nodes_iter->second;
if (front_nodes.find(output_idx) != front_nodes.end()) {
return true;
}
}
return false;
}


+1 -4  mindspore/ccsrc/backend/session/kernel_graph.h

@@ -153,9 +153,7 @@ class KernelGraph : public FuncGraph {
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1,
int dst_output_idx = -1);
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node) const;
void AddFinalOutputKernel(const AnfNodePtr &node);
bool IsFinalOutputKernel(const AnfNodePtr &node) const;
bool IsInternalOutput(const AnfNodePtr &node, int output_idx = -1) const;
uint32_t current_epoch() const { return current_epoch_; }
void set_current_epoch(uint32_t epoch) { current_epoch_ = epoch; }
void UpdateChildGraphOrder();
@@ -230,7 +228,6 @@ class KernelGraph : public FuncGraph {
bool null_output_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_;
std::set<AnfNodePtr> final_output_kernels_;
uint32_t current_epoch_;
};
} // namespace session


+6 -8  mindspore/ccsrc/backend/session/session_basic.cc

@@ -89,7 +89,7 @@ BaseRef CreateOneTensor(const AnfNodePtr &node, size_t output_index, const Kerne
TypeId type_id = kNumberTypeFloat32;
type_id = AnfAlgo::GetOutputInferDataType(node, output_index);
std::vector<int> temp_shape;
if (graph.IsInternalOutput(node)) {
if (graph.IsInternalOutput(node, output_index)) {
temp_shape.emplace_back(1);
tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape);
tensor->set_device_address(address);
@@ -307,18 +307,17 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx);
auto ref_real_node = real_kernel.first;
auto ref_real_node_index = real_kernel.second;
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) &&
node_graph->IsFinalOutputKernel(ref_real_node)) {
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node, ref_real_node_index)) {
auto kernel_info = ref_real_node->kernel_info();
if (kernel_info == nullptr || !kernel_info->has_build_info()) {
MS_LOG(INFO) << "No kernel info";
return;
}
auto address = AnfAlgo::GetMutableOutputAddr(ref_real_node, ref_real_node_index);
if (address == nullptr) {
if (!opt::IsNopNode(ref_real_node) && !AnfAlgo::OutputAddrExist(ref_real_node, ref_real_node_index)) {
MS_LOG(INFO) << "No kernel address";
return;
}
auto address = AnfAlgo::GetMutableOutputAddr(ref_real_node, ref_real_node_index);
auto format = AnfAlgo::GetOutputFormat(ref_real_node, ref_real_node_index);
auto type = AnfAlgo::GetOutputDeviceDataType(ref_real_node, ref_real_node_index);
auto d_kernel_info = std::make_shared<device::KernelInfo>();
@@ -1004,6 +1003,7 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
break;
}
}

if (internal_output) {
MS_LOG(INFO) << "Internal output1: " << out->DebugString() << "To " << backend_real_kernel.first->DebugString();
graph->AddInternalOutput(out, backend_real_kernel.first);
@@ -1203,11 +1203,9 @@ void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
auto pk_node = input_node->cast<ParameterPtr>();
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(
pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes()));
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(pk_node->fullname_with_scope(), tensor);
}
}
ps_init_ = true;
}
#endif
} // namespace session


+1 -2  mindspore/ccsrc/backend/session/session_basic.h

@@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;

class SessionBasic {
public:
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) {
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) {
#ifdef ENABLE_DEBUGGER
debugger_ = nullptr;
#endif
@@ -152,7 +152,6 @@ class SessionBasic {
CallBackFunc summary_callback_;
static GraphId graph_sum_;
uint32_t device_id_;
bool ps_init_;
#ifdef ENABLE_DEBUGGER
std::shared_ptr<Debugger> debugger_;
#endif


+13 -4  mindspore/ccsrc/frontend/operator/prim_others.cc

@@ -378,10 +378,19 @@ AbstractBasePtr InferImplMakeIndexedSlices(const AnalysisEnginePtr &, const Prim
auto elem = GetValue<int>(e);
return elem;
});
for (auto dense_shape_elem : dense_shape_vec) {
if (dense_shape_elem < 0) {
MS_EXCEPTION(TypeError) << "The element of dense_shape must be positive, but got "
<< dense_shape_value->ToString();
if (dense_shape_vec.size() != values_shp.size()) {
MS_EXCEPTION(TypeError) << "The size of dense_shape must be the same with the dimension of values "
<< values_shp.size() << ", but got " << dense_shape_value->size();
}
for (size_t i = 0; i < dense_shape_vec.size(); i++) {
if (dense_shape_vec[i] < 0) {
MS_EXCEPTION(TypeError) << "The " << i << "th element of dense_shape must be positive, but got "
<< dense_shape_vec[i];
}
// The 0th mode might be less or exceed dense_shape[0] due to duplicated selection
if (i != 0 && dense_shape_vec[i] != values_shp[i]) {
MS_EXCEPTION(TypeError) << "The " << i << "th element of dense_shape must be same with the " << i
<< "th dimension of values " << values_shp[i] << ", but got " << dense_shape_vec[i];
}
}
auto ret = std::make_shared<AbstractIndexedSlices>(values->element()->BuildType(), dense_shape_vec);
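Concretely, the tightened check requires dense_shape to have the same rank as values and to match every dimension except the first, which may differ because row ids can repeat; a few assumed shapes for illustration:

values shape (3, 4):
  dense_shape (10, 4)     -> accepted (dense_shape[0] may exceed the number of selected rows)
  dense_shape (10, 5)     -> rejected (dense_shape[1] != values.shape[1])
  dense_shape (10, 4, 1)  -> rejected (rank differs from values)
  dense_shape (-1, 4)     -> rejected (every element must be positive)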


+3 -2  mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.h

@@ -34,7 +34,8 @@ namespace parallel {
#define OPERATOR_TO_OPERATOR_CONNECTOR "-"
#define DEFAULT_DEVICE_MEMORY_CAPACITY (1024.0 * 1024.0 * 1024.0 * 16.0)
#define DEFAULT_COST_MODEL_ALPHA 1.0
#define DEFAULT_COST_MODEL_BETA 400.0
#define DEFAULT_COST_MODEL_BETA_ASCEND 400.0 // for 'device_target = Ascend'
#define DEFAULT_COST_MODEL_BETA_GPU 50.0 // for 'device_target = GPU'
#define DEFAULT_COST_MODEL_GAMMA 0.001
#define DEFAULT_COST_MODEL_SIMPLIFY_CALCULATION true
#define DEFAULT_COST_MODEL_COMMUNI_THRESHOLD 2048.0
@@ -73,7 +74,7 @@ class CostGraph {
CostGraph() {
dev_memory_ = DEFAULT_DEVICE_MEMORY_CAPACITY;
costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA;
costmodel_beta_ = DEFAULT_COST_MODEL_BETA;
costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND;
}
~CostGraph() = default;
void AddOperator(const OperatorInfoPtr &op) { ops_.push_back(op); }


+8 -1  mindspore/ccsrc/frontend/parallel/costmodel_context.cc

@@ -20,6 +20,7 @@

#include "frontend/parallel/allreduce_fusion/allreduce_fusion.h"
#include "frontend/parallel/auto_parallel/graph_costmodel.h"
#include "utils/context/ms_context.h"

namespace mindspore {
namespace parallel {
@@ -41,7 +42,7 @@ CostModelContext::CostModelContext() {
void CostModelContext::ResetCostModel() {
device_memory_capacity_ = DEFAULT_DEVICE_MEMORY_CAPACITY;
costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA;
costmodel_beta_ = DEFAULT_COST_MODEL_BETA;
costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND;
costmodel_gamma_ = DEFAULT_COST_MODEL_GAMMA;
costmodel_communi_threshold_ = DEFAULT_COST_MODEL_COMMUNI_THRESHOLD;
costmodel_communi_const_ = DEFAULT_COST_MODEL_COMMUNI_CONST;
@@ -66,6 +67,12 @@ void CostModelContext::ResetAlgoParameters() {
elementwise_stra_follow_ = DEFAULT_ELEMENTWISE_OP_STRA_FOLLOW;
}

void CostModelContext::set_costmodel_context_for_device(const std::string &device_target) {
if (device_target == kGPUDevice) {
costmodel_beta_ = DEFAULT_COST_MODEL_BETA_GPU;
}
}

void CostModelContext::set_device_memory_capacity(double dm_capacity) { device_memory_capacity_ = dm_capacity; }

void CostModelContext::set_costmodel_alpha(double cm_alpha) { costmodel_alpha_ = cm_alpha; }


+ 1
- 0
mindspore/ccsrc/frontend/parallel/costmodel_context.h View File

@@ -35,6 +35,7 @@ class CostModelContext {

static std::shared_ptr<CostModelContext> GetInstance();

void set_costmodel_context_for_device(const std::string &);
// DEVICE_MEMORY_CAPACITY
void set_device_memory_capacity(double);
double device_memory_capacity() const { return device_memory_capacity_; }


+ 7
- 0
mindspore/ccsrc/frontend/parallel/ps/common.h View File

@@ -57,15 +57,22 @@ constexpr char kMomentum[] = "momentum";
constexpr char kApplyMomentum[] = "ApplyMomentum";
constexpr char kSparseAdam[] = "Adam";
constexpr char kSparseFtrl[] = "Ftrl";
constexpr char kApplyMomentumOp[] = "Momentum";
constexpr char kSparseAdamOp[] = "Adam";
constexpr char kSparseFtrlOp[] = "FTRL";

constexpr int kInitWeightsCmd = 10;
constexpr int kInitWeightToOptimIdCmd = 11;
constexpr int kInitOptimInputsShapeCmd = 12;
constexpr int kInitKeyToPushNodeIdCmd = 13;
constexpr int kInitEmbeddingsCmd = 20;
constexpr int kCheckReadyForPushCmd = 25;
constexpr int kCheckReadyForPullCmd = 26;
constexpr int kEmbeddingLookupCmd = 30;
constexpr int kFinalizeCmd = 40;

constexpr size_t kInvalidKey = UINT64_MAX;
constexpr int kInvalidID = -1;

using Key = ::ps::Key;
using Keys = ::ps::SArray<Key>;


+ 7
- 4
mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc View File

@@ -158,16 +158,19 @@ OptimizerInfo *SparseFtrlOptimInfoBuilder::BuildInputs(const WeightPtr &weight,
}
AddressPtr linear = std::make_shared<kernel::Address>();
linear->addr = new float[weight->size()];
memcpy_s(linear->addr, weight->size() * sizeof(float), 0x00, weight->size() * sizeof(float));
auto ret = memset_s(linear->addr, weight->size() * sizeof(float), 0x00, weight->size() * sizeof(float));
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
linear->size = weight->size() * sizeof(float);

const std::shared_ptr<std::vector<size_t>> &grad_shape = (*inputs_shape)[3];
size_t total_grad_size = std::accumulate((*grad_shape).begin(), (*grad_shape).end(), 1, std::multiplies<size_t>());
AddressPtr grad = std::make_shared<kernel::Address>();
grad->addr = new float[total_grad_size * worker_num];
auto ret = memcpy_s(grad->addr, lens[0] * sizeof(float), values.data(), lens[0] * sizeof(float));
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
auto ret1 = memcpy_s(grad->addr, lens[0] * sizeof(float), values.data(), lens[0] * sizeof(float));
if (ret1 != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret1 << ")";
}
grad->size = lens[0] * sizeof(float);
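memset_s and memcpy_s here come from the securec library; the change is simply to stop ignoring their return codes. A rough standard-C++ analogue of the same check-then-copy pattern is sketched below — the helper names are illustrative and not part of this diff.

#include <cstring>
#include <sstream>
#include <stdexcept>

// Bounded copy/fill that refuses to run when the destination is too small,
// turning the non-zero error-code case above into an exception.
void CheckedCopy(void *dst, std::size_t dst_size, const void *src, std::size_t copy_size) {
  if (dst == nullptr || src == nullptr || copy_size > dst_size) {
    std::ostringstream oss;
    oss << "checked copy failed, dst_size=" << dst_size << ", copy_size=" << copy_size;
    throw std::runtime_error(oss.str());
  }
  std::memcpy(dst, src, copy_size);
}

void CheckedFill(void *dst, std::size_t dst_size, unsigned char value, std::size_t fill_size) {
  if (dst == nullptr || fill_size > dst_size) {
    throw std::runtime_error("checked fill failed");
  }
  std::memset(dst, value, fill_size);
}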



+ 119
- 19
mindspore/ccsrc/frontend/parallel/ps/parameter_server.h View File

@@ -28,6 +28,7 @@
#include <thread>
#include <cmath>
#include <random>
#include <list>
#include "ir/func_graph.h"
#include "backend/session/session_basic.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -70,6 +71,7 @@ class ParameterServer {
handler_(nullptr),
func_graph_(nullptr),
sess_(nullptr),
running_(true),
thread_(nullptr) {}
~ParameterServer() = default;
ParameterServer(const ParameterServer &) = delete;
@@ -89,6 +91,8 @@ class ParameterServer {
::ps::KVPairs<T> *res);
void HandleInitInputsShape(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleInitEmbeddings(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleCheckReadyForPush(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleCheckReadyForPull(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
@@ -96,6 +100,9 @@ class ParameterServer {
typedef void (ServerHandler::*RequestHandler)(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res);
std::unordered_map<int, RequestHandler> handlers_;
std::unordered_map<Key, bool> init_weights_;
std::unordered_map<Key, bool> init_weight_to_optim_;
std::unordered_map<Key, bool> init_optim_info_;
};
bool Init(const FuncGraphPtr &func_graph);
@@ -106,14 +113,18 @@ class ParameterServer {
void InitGrad(const Key &key, const GradPtr &grad);
void InitEmbeddingTable(const Key &key,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes);
void Finalize();
void UpdateWeights();
void AccumGrad(const Keys &key, const Values &values, const Lengths &lengths);
WeightPtr weight(const Key &key);
void DoEmbeddingLookup(Key key, const LookupIds &lookup_ids, ::ps::KVPairs<T> *res);
int SumOfShapes(const std::vector<int> &shapes) const;
bool ReadyForUpdateWeights();
bool ReadyForAccumGrads();
bool ReadyForPush(const Key &key);
bool ReadyForPull(const Key &key);
void ResetGradAccumCount();
std::mutex &mutex();
const CNodePtr GetCNode(const std::string &name) const;
size_t pserver_num_;
size_t worker_num_;
@@ -123,20 +134,23 @@ class ParameterServer {
std::unique_ptr<ServerHandler> handler_;
FuncGraphPtr func_graph_;
std::shared_ptr<session::SessionBasic> sess_;
bool running_;
std::unordered_map<Key, std::shared_ptr<PServerKernel>> optimizers_;
std::unordered_map<Key, InputsShapePtr> optim_inputs_shape_;
std::unordered_map<Key, std::shared_ptr<OptimizerInfo>> optim_infos_;
std::unordered_map<std::string, std::shared_ptr<OptimizerInfoBuilder>> optim_info_builders_;
std::unordered_map<Key, std::string> weight_key_to_optims_;
std::unordered_map<Key, std::string> weight_key_to_optim_op_;
std::unordered_map<Key, WeightPtr> weights_;
std::unordered_map<Key, bool> is_embedding_;
std::unordered_map<Key, WeightPtr> grads_;
std::unordered_map<Key, size_t> grads_accum_counter_;
std::unordered_map<Key, std::shared_ptr<PServerKernel>> embedding_lookup_ops_;
std::unordered_map<Key, uint64_t> tokens_;
std::mutex mutex_;
std::condition_variable apply_grads_cv_;
std::condition_variable accum_grads_cv_;
std::unique_ptr<std::thread> thread_;
@@ -165,6 +179,8 @@ void ParameterServer<T>::ServerHandler::Init() {
handlers_[kInitWeightToOptimIdCmd] = &ServerHandler::HandleInitWeightToOptimId;
handlers_[kInitOptimInputsShapeCmd] = &ServerHandler::HandleInitInputsShape;
handlers_[kInitEmbeddingsCmd] = &ServerHandler::HandleInitEmbeddings;
handlers_[kCheckReadyForPushCmd] = &ServerHandler::HandleCheckReadyForPush;
handlers_[kCheckReadyForPullCmd] = &ServerHandler::HandleCheckReadyForPull;
handlers_[kEmbeddingLookupCmd] = &ServerHandler::HandleEmbeddingLookup;
handlers_[kFinalizeCmd] = &ServerHandler::HandleFinalize;
}
@@ -186,6 +202,7 @@ void ParameterServer<T>::ServerHandler::HandlePullReq(const ::ps::KVMeta &req_me
template <typename T>
void ParameterServer<T>::ServerHandler::HandleInitWeights(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) {
std::unique_lock<std::mutex> lock(ps_->mutex());
size_t key_num = req_data.keys.size();
T *data_ptr = req_data.vals.data();
size_t pos = 0;
@@ -207,10 +224,16 @@ template <typename T>
void ParameterServer<T>::ServerHandler::HandleInitWeightToOptimId(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
std::unique_lock<std::mutex> lock(ps_->mutex());
size_t key_num = req_data.keys.size();
for (size_t i = 0; i < key_num; i++) {
Key key = req_data.keys[i];
T val = req_data.vals[i];
if (init_weight_to_optim_[key]) {
continue;
} else {
init_weight_to_optim_[key] = true;
}
ps_->InitWeightKeyToOptims(key, val);
}
}
@@ -218,12 +241,21 @@ void ParameterServer<T>::ServerHandler::HandleInitWeightToOptimId(const ::ps::KV
template <typename T>
void ParameterServer<T>::ServerHandler::HandleInitInputsShape(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) {
std::unique_lock<std::mutex> lock(ps_->mutex());
const Key &key = req_data.keys[0];
if (init_optim_info_[key]) {
return;
} else {
init_optim_info_[key] = true;
}
ps_->InitOptimInputsShape(req_data.keys, req_data.vals, req_data.lens);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) {
std::unique_lock<std::mutex> lock(ps_->mutex());
const Key &key = req_data.keys[0];
std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> shapes =
std::make_shared<std::vector<std::shared_ptr<std::vector<size_t>>>>();
std::shared_ptr<std::vector<size_t>> input_shape = std::make_shared<std::vector<size_t>>();
@@ -233,7 +265,6 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta
shapes->push_back(indices_shape);
shapes->push_back(output_shape);
const Key &key = req_data.keys[0];
const Lengths &lens = req_data.lens;
size_t index = 0;
for (int i = 0; i < lens[0]; i++) {
@@ -248,6 +279,26 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta
ps_->InitEmbeddingTable(key, shapes);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleCheckReadyForPush(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
const Key &key = req_data.keys[0];
bool ready = ps_->ReadyForPush(key);
res->keys.push_back(key);
res->vals.push_back(ready);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleCheckReadyForPull(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
const Key &key = req_data.keys[0];
bool ready = ps_->ReadyForPull(key);
res->keys.push_back(key);
res->vals.push_back(ready);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) {
@@ -261,7 +312,7 @@ void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta
template <typename T>
void ParameterServer<T>::ServerHandler::HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
::ps::Finalize(0, false);
ps_->Finalize();
}
template <typename T>
@@ -274,7 +325,6 @@ bool ParameterServer<T>::Init(const FuncGraphPtr &func_graph) {
handler_->Init();
InitOptimInfoBuilders();
ps_->set_request_handle(*handler_);
thread_.reset(new std::thread(&ParameterServer::UpdateWeights, this));
return true;
@@ -296,6 +346,7 @@ void ParameterServer<T>::InitWeightKeyToOptims(const Key &key, const int &optim_
return;
}
weight_key_to_optims_[key] = Util::optimizer_name(optim_id);
weight_key_to_optim_op_[key] = Util::optimizer_node_name(optim_id);
}
template <typename T>
@@ -318,31 +369,49 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va
}
if (weight_key_to_optims_.count(key) > 0) {
const std::string &optim_name = weight_key_to_optims_[key];
const std::string &optim_op_name = weight_key_to_optim_op_[key];
if (optimizers_.count(key) == 0 && optim_inputs_shape_.count(key) > 0) {
const CNodePtr cnode = GetCNode(optim_op_name);
MS_EXCEPTION_IF_NULL(cnode);
if (optim_name == kSparseAdam) {
std::shared_ptr<PServerKernel> optimizer =
std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_);
optimizer->InitKernel(optim_inputs_shape_[key]);
optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
optimizers_[key] = optimizer;
} else if (optim_name == kApplyMomentum) {
std::shared_ptr<PServerKernel> optimizer =
std::make_shared<kernel::ps::ApplyMomentumPSKernel>(rank_id_, pserver_num_);
optimizer->InitKernel(optim_inputs_shape_[key]);
optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
optimizers_[key] = optimizer;
} else if (optim_name == kSparseFtrl) {
std::shared_ptr<PServerKernel> optimizer =
std::make_shared<kernel::ps::SparseApplyFtrlPSKernel>(rank_id_, pserver_num_);
optimizer->InitKernel(optim_inputs_shape_[key]);
optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
optimizers_[key] = optimizer;
}
}
}
}
template <typename T>
const CNodePtr ParameterServer<T>::GetCNode(const std::string &name) const {
std::list<CNodePtr> cnodes = func_graph_->GetOrderedCnodes();
for (CNodePtr cnode : cnodes) {
std::string fullname = cnode->fullname_with_scope();
if (fullname.find(name) != std::string::npos && fullname.find("Push") != std::string::npos) {
return cnode;
}
}
return nullptr;
}
template <typename T>
void ParameterServer<T>::InitWeight(const Key &key, const WeightPtr &weight) {
if (weights_.count(key) == 0) {
MS_LOG(INFO) << "Initializing weight for key " << key;
if ((weights_.count(key) == 0) || (is_embedding_[key] && weights_.count(key) != 0)) {
weights_[key] = weight;
tokens_[key] = 0;
is_embedding_[key] = false;
}
}
@@ -357,7 +426,7 @@ void ParameterServer<T>::InitGrad(const Key &key, const GradPtr &grad) {
template <typename T>
void ParameterServer<T>::InitEmbeddingTable(
const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
// Init embedding lookup kernel
MS_LOG(INFO) << "Initializing embedding table for key " << key;
std::shared_ptr<PServerKernel> lookup = std::make_shared<kernel::ps::EmbeddingLookUpPSKernel>(rank_id_, pserver_num_);
lookup->InitKernel(shapes);
embedding_lookup_ops_[key] = lookup;
@@ -377,15 +446,26 @@ void ParameterServer<T>::InitEmbeddingTable(
embedding_data[i] = random(engine);
}
weights_[key] = embedding;
tokens_[key] = 0;
is_embedding_[key] = true;
grads_accum_counter_[key] = 0;
}
template <typename T>
void ParameterServer<T>::Finalize() {
running_ = false;
apply_grads_cv_.notify_one();
}
template <typename T>
void ParameterServer<T>::UpdateWeights() {
while (true) {
std::unique_lock<std::mutex> lock(mutex_);
apply_grads_cv_.wait(lock, [this] { return this->ReadyForUpdateWeights(); });
apply_grads_cv_.wait(lock, [this] { return this->ReadyForUpdateWeights() || !running_; });
if (!running_) {
break;
}
for (auto iter = weights_.begin(); iter != weights_.end(); iter++) {
Key key = iter->first;
@@ -408,17 +488,17 @@ void ParameterServer<T>::UpdateWeights() {
optim_info->ComputeMean(worker_num_);
optimizer->Execute(inputs, workspaces, outputs);
optim_info->Reset();
if (!is_embedding_[key]) {
tokens_[key] = worker_num_;
}
}
ResetGradAccumCount();
accum_grads_cv_.notify_all();
}
}
template <typename T>
void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const Lengths &lengths) {
std::unique_lock<std::mutex> lock(mutex_);
accum_grads_cv_.wait(lock, [this] { return this->ReadyForAccumGrads(); });
const Key &key = keys[0];
std::shared_ptr<OptimizerInfo> optim_info = optim_infos_[key];
@@ -451,14 +531,13 @@ void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const
template <typename T>
WeightPtr ParameterServer<T>::weight(const Key &key) {
std::unique_lock<std::mutex> lock(mutex_);
if (weights_.count(key) == 0) {
MS_LOG(ERROR) << "Invalid weight key " << key;
return nullptr;
MS_LOG(EXCEPTION) << "Invalid weight key " << key;
}
WeightPtr weight_ptr = weights_[key];
WeightPtr copy_weight_ptr = std::make_shared<::ps::SArray<T>>(weight_ptr->size(), 0);
copy_weight_ptr->CopyFrom(weight_ptr->data(), weight_ptr->size());
tokens_[key] -= 1;
return copy_weight_ptr;
}
@@ -529,8 +608,22 @@ inline bool ParameterServer<T>::ReadyForUpdateWeights() {
}
template <typename T>
inline bool ParameterServer<T>::ReadyForAccumGrads() {
return grad_accum_count_ < weights_.size();
inline bool ParameterServer<T>::ReadyForPush(const Key &key) {
std::unique_lock<std::mutex> lock(mutex_);
if (weights_.empty()) {
MS_LOG(EXCEPTION) << "The weights in server is empty. Many reasons could cause this: 1.The Worker didn't send "
"kInitWeightsCmd command. 2.The Server failed to initialize weights.";
}
return grad_accum_count_ < weights_.size() && tokens_[key] <= 0;
}
template <typename T>
inline bool ParameterServer<T>::ReadyForPull(const Key &key) {
std::unique_lock<std::mutex> lock(mutex_);
if (tokens_.count(key) == 0 || weights_[key] == 0) {
MS_LOG(EXCEPTION) << "Invalid weight key " << key;
}
return tokens_[key] > 0;
}
template <typename T>
@@ -541,6 +634,11 @@ inline void ParameterServer<T>::ResetGradAccumCount() {
}
}
template <typename T>
inline std::mutex &ParameterServer<T>::mutex() {
return mutex_;
}
template <typename T>
void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
::ps::Start(0);
@@ -550,6 +648,8 @@ void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
}
Init(func_graph);
thread_->join();
::ps::Finalize(0, true);
exit(1);
}
} // namespace ps
} // namespace parallel
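Most of this file's change introduces per-key tokens and a running_ flag: after each weight update the server hands out worker_num_ pull tokens per non-embedding key, ReadyForPull succeeds while tokens remain, ReadyForPush waits until the previous round's tokens and gradients have been consumed, and Finalize wakes the update thread so it can exit instead of blocking on the condition variable forever. The toy class below sketches that handshake with only standard-library types; the class and member names are illustrative, not the MindSpore ones, and error handling is omitted.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

// Toy parameter server: each key owns a weight and a token count, plus one
// shared gradient-accumulation counter. Pull is allowed while tokens remain;
// push is allowed once tokens are drained; Finalize() lets the update loop exit.
class ToyParamServer {
 public:
  explicit ToyParamServer(std::size_t worker_num) : worker_num_(worker_num) {}

  void InitWeight(int key, std::vector<float> w) {
    std::lock_guard<std::mutex> lock(mutex_);
    weights_[key] = std::move(w);
    tokens_[key] = 0;
  }

  bool ReadyForPush(int key) {
    std::lock_guard<std::mutex> lock(mutex_);
    return grad_count_ < weights_.size() && tokens_[key] <= 0;
  }

  bool ReadyForPull(int key) {
    std::lock_guard<std::mutex> lock(mutex_);
    return tokens_[key] > 0;
  }

  void AccumGrad() {
    std::lock_guard<std::mutex> lock(mutex_);
    if (++grad_count_ >= weights_.size()) {
      apply_cv_.notify_one();  // every key has received its gradients
    }
  }

  std::vector<float> Pull(int key) {
    std::lock_guard<std::mutex> lock(mutex_);
    tokens_[key] -= 1;  // each worker consumes one token per update round
    return weights_[key];
  }

  void UpdateLoop() {
    while (true) {
      std::unique_lock<std::mutex> lock(mutex_);
      apply_cv_.wait(lock, [this] {
        return (!weights_.empty() && grad_count_ >= weights_.size()) || !running_;
      });
      if (!running_) break;
      // (apply the accumulated gradients here) ... then hand out fresh pull tokens
      for (auto &kv : tokens_) kv.second = static_cast<int>(worker_num_);
      grad_count_ = 0;
    }
  }

  void Finalize() {
    std::lock_guard<std::mutex> lock(mutex_);
    running_ = false;
    apply_cv_.notify_one();
  }

 private:
  std::size_t worker_num_;
  bool running_ = true;
  std::size_t grad_count_ = 0;
  std::unordered_map<int, std::vector<float>> weights_;
  std::unordered_map<int, int> tokens_;
  std::mutex mutex_;
  std::condition_variable apply_cv_;
};

On the worker side, the new kCheckReadyForPushCmd and kCheckReadyForPullCmd requests play the role of polling ReadyForPush and ReadyForPull before gradients are pushed or weights are pulled.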


+ 2
- 3
mindspore/ccsrc/frontend/parallel/ps/scheduler.cc View File

@@ -23,9 +23,8 @@ namespace parallel {
namespace ps {
void Scheduler::Run() {
::ps::Start(0);
while (true) {
sleep(1);
}
::ps::Finalize(0, true);
exit(1);
}
} // namespace ps
} // namespace parallel


+ 14
- 0
mindspore/ccsrc/frontend/parallel/ps/util.cc View File

@@ -33,6 +33,13 @@ std::unordered_map<int, std::string> Util::id_to_optimizers{
{1, kSparseAdam},
{2, kSparseFtrl},
};

std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{
{0, kApplyMomentumOp},
{1, kSparseAdamOp},
{2, kSparseFtrlOp},
};

bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); }

bool Util::IsRoleOfWorker() {
@@ -112,6 +119,13 @@ std::string Util::optimizer_name(int id) {
return "";
}

std::string Util::optimizer_node_name(int id) {
if (id_to_optimizer_nodes.count(id) > 0) {
return id_to_optimizer_nodes[id];
}
return "";
}

bool Util::is_optimizer(std::string name) { return optimizer_to_ids.count(name) > 0; }

int Util::LocalShard(int first_dim, int rank_id, int server_num) {


+ 2
- 0
mindspore/ccsrc/frontend/parallel/ps/util.h View File

@@ -34,12 +34,14 @@ class Util {
static void SetInternalEnvVar();
static int optimizer_id(std::string name);
static std::string optimizer_name(int id);
static std::string optimizer_node_name(int id);
static bool is_optimizer(std::string name);
static int LocalShard(int first_dim, int rank_id, int server_num);

private:
static std::unordered_map<std::string, int> optimizer_to_ids;
static std::unordered_map<int, std::string> id_to_optimizers;
static std::unordered_map<int, std::string> id_to_optimizer_nodes;
};
} // namespace ps
} // namespace parallel


+ 54
- 11
mindspore/ccsrc/frontend/parallel/ps/worker.h View File

@@ -24,6 +24,7 @@
#include <map>
#include "ps/ps.h"
#include "utils/log_adapter.h"
#include "ir/tensor.h"
#include "frontend/parallel/ps/util.h"
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/worker_proxy.h"
@@ -43,18 +44,20 @@ class Worker {
void Push(const std::vector<size_t> &keys, std::vector<uintptr_t> addrs, const std::vector<int> &sizes);
void Pull(const size_t key, void *dev_addr, const size_t size);
size_t SetParamKey(const std::string &param_name);
void SetParamInitInServer(const std::string &param_name, bool init_in_server);
bool GetParamInitInServer(const std::string &param_name);
void SetKeyOptimId(size_t key, const std::string &optimizer_name);
void SetOptimInputShapes(size_t key, const std::vector<int> &shape);
void AddEmbeddingTable(const ::ps::Key &key, const size_t &row_count);
void InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vector<size_t> shapes, const std::vector<int> &sizes);
void InitPSParamAndOptim(const std::string &param_name, void *param_data, size_t param_size);
void InitPSParamAndOptim(const std::string &param_name, tensor::TensorPtr tensor);
void DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<int> &lookup_ids,
const ::ps::SArray<int> &lens, ::ps::SArray<T> *lookup_result, int cmd);
void Finalize();

private:
Worker() : kv_worker_(nullptr), running_(false), key_cnt_(0) {}
~Worker() { ::ps::Finalize(0, true); }
~Worker() = default;
Worker(const Worker &) = delete;
Worker &operator=(const Worker &) = delete;

@@ -73,6 +76,7 @@ class Worker {
std::map<size_t, bool> init_keys_;
std::map<size_t, int> key_to_optimId_;
std::map<size_t, std::vector<std::vector<int>>> key_to_optim_shapes_;
std::map<std::string, bool> param_to_init_in_server_;
};

template <typename T>
@@ -81,7 +85,6 @@ void Worker<T>::Run() {
MS_LOG(INFO) << "'Worker is already running.";
return;
}

::ps::Start(0);
if (!::ps::IsWorker()) {
MS_LOG(EXCEPTION) << "The role is not worker.";
@@ -99,18 +102,30 @@ void Worker<T>::Push(const std::vector<size_t> &keys, std::vector<uintptr_t> add
::ps::SArray<T> total_buffer(total_size, 0);
size_t offset = 0;
for (size_t i = 0; i < sizes.size(); i++) {
memcpy_s(total_buffer.data() + offset / sizeof(T), sizes[i] * sizeof(T), reinterpret_cast<void *>(addrs[i]),
sizes[i] * sizeof(T));
auto ret = memcpy_s(total_buffer.data() + offset / sizeof(T), sizes[i] * sizeof(T),
reinterpret_cast<void *>(addrs[i]), sizes[i] * sizeof(T));
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
offset += sizes[i] * sizeof(T);
}
while (!kv_worker_->IsReadyForPush(keys[0])) {
continue;
}
kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes));
}

template <typename T>
void Worker<T>::Pull(const size_t key, void *dev_addr, const size_t size) {
::ps::SArray<T> variables(size / sizeof(T), 0);
while (!kv_worker_->IsReadyForPull(key)) {
continue;
}
kv_worker_->Wait(kv_worker_->ZPull({key}, &variables));
memcpy_s(dev_addr, size, variables.data(), size);
auto ret = memcpy_s(dev_addr, size, variables.data(), size);
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
}

template <typename T>
@@ -121,7 +136,11 @@ void Worker<T>::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const :

template <typename T>
void Worker<T>::Finalize() {
kv_worker_->Finalize();
if (running_) {
kv_worker_->Finalize();
kv_worker_.reset();
running_ = false;
}
}

template <typename T>
@@ -192,6 +211,20 @@ size_t Worker<T>::SetParamKey(const std::string &param_name) {
return key;
}

template <typename T>
void Worker<T>::SetParamInitInServer(const std::string &param_name, bool init_in_server) {
MS_LOG(INFO) << "Set parameter " << param_name << " init_in_server:" << init_in_server;
param_to_init_in_server_[param_name] = init_in_server;
}

template <typename T>
bool Worker<T>::GetParamInitInServer(const std::string &param_name) {
if (param_to_init_in_server_.count(param_name) == 0) {
return false;
}
return param_to_init_in_server_[param_name];
}

template <typename T>
size_t Worker<T>::GetParamKey(const std::string &param_name) {
size_t key = kInvalidKey;
@@ -237,17 +270,27 @@ void Worker<T>::InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vecto

template <typename T>
// Initialize parameters and optimizer kernels of Parameter Server.
void Worker<T>::InitPSParamAndOptim(const std::string &param_name, void *param_data, size_t param_size) {
void Worker<T>::InitPSParamAndOptim(const std::string &param_name, tensor::TensorPtr tensor) {
void *param_data = tensor->data_c();
size_t param_size = LongToSize(tensor->data().nbytes());
std::vector<int> param_shape = tensor->shape_c();

size_t param_key = GetParamKey(param_name);
if (param_key == kInvalidKey) {
MS_LOG(INFO) << "Parameter " << param_name << " has no key assigned.";
return;
}
bool init_in_server = false;
std::vector<int> shape_init_in_server = {1};
if (param_shape == shape_init_in_server) {
init_in_server = true;
}
SetParamInitInServer(param_name, init_in_server);
bool init = IsKeyInit(param_key);
if (!init) {
MS_LOG(INFO) << "Init paramter and optimizer in parameter server side for " << param_name;
// No need to push embedding table data to Parameter Server.
if (param_name.find("embedding_table") == std::string::npos && param_name.find("wide_w") == std::string::npos) {
MS_LOG(INFO) << "Init paramter and optimizer in parameter server side for " << param_name
<< ", whether init in server: " << init_in_server;
if (!init_in_server) {
InitPSParamData({param_key}, param_data, param_size);
}
InitPSOptimId(param_key);


+ 25
- 1
mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h View File

@@ -56,6 +56,8 @@ class WorkerProxy : public ::ps::KVWorker<T> {
int priority = 0);
int InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals,
const ::ps::SArray<int> &lens = {}, const Callback &cb = nullptr, int priority = 0);
bool IsReadyForPush(const Key &key);
bool IsReadyForPull(const Key &key);
void PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, const ::ps::SArray<int> &lens = {},
int cmd = 0, int priority = 0);
void Finalize();
@@ -134,6 +136,28 @@ int WorkerProxy<T>::InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, cons
return ts;
}

template <typename T>
bool WorkerProxy<T>::IsReadyForPush(const Key &key) {
::ps::SArray<T> result(1, 0);
this->Wait(this->ZPull({key}, &result, nullptr, kCheckReadyForPushCmd));
if (result[0] > 0) {
return true;
} else {
return false;
}
}

template <typename T>
bool WorkerProxy<T>::IsReadyForPull(const Key &key) {
::ps::SArray<T> result(1, 0);
this->Wait(this->ZPull({key}, &result, nullptr, kCheckReadyForPullCmd));
if (result[0] > 0) {
return true;
} else {
return false;
}
}

template <typename T>
void WorkerProxy<T>::PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals,
const ::ps::SArray<int> &lens, int cmd, int priority) {
@@ -155,7 +179,7 @@ void WorkerProxy<T>::Finalize() {
kvs.vals.push_back(0.0f);
Send(obj_, ts, true, false, kFinalizeCmd, kvs, broadcast_slicer_);
obj_->WaitRequest(ts);
::ps::Finalize(0, false);
::ps::Finalize(0, true);
}

template <typename T>


+ 3
- 1
mindspore/ccsrc/minddata/dataset/CMakeLists.txt View File

@@ -47,7 +47,9 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/include
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ORIGIN:$ORIGIN/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default")

ms_build_flatbuffers("engine/cache/de_tensor.fbs" ${CMAKE_CURRENT_SOURCE_DIR} generated_engine_files ${CMAKE_BINARY_DIR})
include_directories("${CMAKE_BINARY_DIR}/minddata/dataset/engine/cache")
set(MD_FLATBUFFER_OU "${CMAKE_BINARY_DIR}/minddata/dataset/engine/cache")
ms_build_flatbuffers("engine/cache/de_tensor.fbs" ${CMAKE_CURRENT_SOURCE_DIR} generated_engine_files ${MD_FLATBUFFER_OU})

################## Include sub-modules ###############################
add_subdirectory(util)


+ 4
- 0
mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc View File

@@ -410,6 +410,7 @@ Status DEPipeline::SaveDataset(const std::vector<std::string> &file_names, const
std::vector<std::string> index_fields;
s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields);
RETURN_IF_NOT_OK(s);
MS_LOG(DEBUG) << "Schema of saved mindrecord: " << mr_json.dump();
if (mindrecord::SUCCESS !=
mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) {
RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader.");
@@ -569,6 +570,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
if (column_name_id_map.empty()) {
RETURN_STATUS_UNEXPECTED("Error: column not found.");
}
json dataset_schema;
for (auto &col : column_name_id_map) {
auto idx = col.second;
auto column_name = col.first;
@@ -580,6 +582,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
auto shapes = column_shape.AsVector();
std::vector<int> mr_shape(shapes.begin(), shapes.end());
std::string el = column_type.ToString();
dataset_schema[column_name] = el;
if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) {
std::string err_msg("Error: can not support data type: " + el);
RETURN_STATUS_UNEXPECTED(err_msg);
@@ -605,6 +608,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string,
if (mr_type == "bytes" || !mr_shape.empty()) continue;
index_fields->emplace_back(column_name); // candidate of index fields
}
MS_LOG(DEBUG) << "Schema of dataset: " << dataset_schema.dump();
return Status::OK();
}
Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle,


+ 4
- 0
mindspore/ccsrc/minddata/dataset/core/tensor.cc View File

@@ -268,6 +268,10 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool();
(*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool);
int64_t byte_size = (*ptr)->SizeInBytes();
if (byte_size == 0) {
return Status::OK();
}

RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size));

unsigned char *data = static_cast<unsigned char *>(arr.request().ptr);


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/cache/cache_client.h View File

@@ -23,9 +23,9 @@
#include <utility>
#include <vector>

#include "./de_tensor_generated.h"
#include "minddata/dataset/engine/data_buffer.h"
#include "minddata/dataset/engine/cache/cache_server.h"
#include "minddata/dataset/engine/cache/de_tensor_generated.h"
#include "minddata/dataset/util/lock.h"

namespace mindspore {


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/cache/cache_request.h View File

@@ -23,8 +23,8 @@
#include <utility>
#include <vector>

#include "./de_tensor_generated.h"
#include "minddata/dataset/core/tensor_row.h"
#include "minddata/dataset/engine/cache/de_tensor_generated.h"
#include "minddata/dataset/util/slice.h"
#include "minddata/dataset/util/wait_post.h"



+ 2
- 1
mindspore/ccsrc/minddata/dataset/engine/cache/cache_service.h View File

@@ -25,10 +25,10 @@
#include <utility>
#include <vector>

#include "./de_tensor_generated.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/cache/cache_request.h"
#include "minddata/dataset/engine/cache/de_tensor_generated.h"
#include "minddata/dataset/util/arena.h"
#include "minddata/dataset/util/btree.h"
#include "minddata/dataset/util/cache_pool.h"
@@ -84,6 +84,7 @@ class CacheService : public Service {
public:
using state_type = std::underlying_type<State>::type;
ServiceStat() : min_(0), max_(0), state_(0) {}
~ServiceStat() = default;
CachePool::CacheStat stat_{};
row_id_type min_;
row_id_type max_;


+ 9
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc View File

@@ -388,6 +388,13 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) {
op->tree_->Print(ss, op);
std::string ss_str = ss.str();

// Filter out the Num workers field when generating the check sum
ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), "");
ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*\\]"), "");

// Filter out Number of rows when generating the check sum
ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), "");

// Filter out the Operator control flags field when generating the check sum
ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), "");

@@ -400,6 +407,8 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) {
ss_str = std::regex_replace(ss_str, std::regex("Cache crc.*\n"), "");
ss_str = std::regex_replace(ss_str, std::regex("Server cache id.*\n"), "");

MS_LOG(DEBUG) << "Printing the tree for generating crc:\n" << ss_str;

uint32_t cache_crc = system::Crc32c::GetMaskCrc32cValue(ss_str.c_str(), ss_str.length());
return cache_crc;
}
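GenerateCRC fingerprints the pipeline by printing the tree and stripping volatile fields (worker counts, row counts, control flags, cache ids) with std::regex_replace before hashing, so two structurally identical pipelines map to the same cache crc. A small self-contained sketch of the same normalize-then-hash idea; std::hash stands in here for the Crc32c used by the real code.

#include <functional>
#include <iostream>
#include <regex>
#include <string>

// Remove fields that can change between otherwise identical pipelines,
// then hash what is left so the fingerprint stays stable.
size_t StableFingerprint(std::string tree_dump) {
  tree_dump = std::regex_replace(tree_dump, std::regex("Num workers.*\n"), "");
  tree_dump = std::regex_replace(tree_dump, std::regex("Number of rows.*\n"), "");
  tree_dump = std::regex_replace(tree_dump, std::regex("Operator control flags.*\n"), "");
  return std::hash<std::string>{}(tree_dump);
}

int main() {
  std::string a = "MapOp\nNum workers: 4\nNumber of rows: 100\n";
  std::string b = "MapOp\nNum workers: 8\nNumber of rows: 50\n";
  // Both dumps collapse to "MapOp\n", so the fingerprints match.
  std::cout << (StableFingerprint(a) == StableFingerprint(b)) << "\n";
  return 0;
}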


+ 2
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc View File

@@ -212,12 +212,12 @@ Status DeviceQueueOp::SendDataToGPU() {
RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle));
total_batch++;
}
if (!TaskManager::FindMe()->Interrupted())
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
RETURN_IF_NOT_OK(GetNextInput(&current_buffer));
else
is_break_loop = true;
}
if (!TaskManager::FindMe()->Interrupted())
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
RETURN_IF_NOT_OK(GetNextInput(&current_buffer));
else
is_break_loop = true;


+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc View File

@@ -758,6 +758,11 @@ Status CsvOp::ComputeColMap() {
} else {
MS_LOG(WARNING) << "Column name map is already set!";
}
if (column_default_list_.size() < column_name_id_map_.size()) {
for (int32_t i = column_default_list_.size(); i < column_name_id_map_.size(); i++) {
column_default_list_.push_back(std::make_shared<CsvOp::Record<std::string>>(CsvOp::STRING, ""));
}
}
return Status::OK();
}
} // namespace dataset
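The ComputeColMap fix pads the per-column default list up to the number of detected columns, so a CSV schema with more columns than explicit defaults still parses and the extra columns fall back to an empty string. The padding rule in isolation, with plain std::string defaults (the function name is illustrative):

#include <cstddef>
#include <string>
#include <vector>

// One default per column: columns without an explicit default get "".
void PadColumnDefaults(std::vector<std::string> *defaults, std::size_t column_count) {
  for (std::size_t i = defaults->size(); i < column_count; ++i) {
    defaults->push_back("");
  }
}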


+ 3
- 2
mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc View File

@@ -679,9 +679,10 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor
}
cv::Mat result;
cv::merge(image_result, result);
result.convertTo(result, input_cv->mat().type());
std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result);
if (input_cv->Rank() == 2) output_cv->Squeeze();
(*output) = std::static_pointer_cast<Tensor>(output_cv);
(*output)->Reshape(input->shape());
} catch (const cv::Exception &e) {
RETURN_STATUS_UNEXPECTED("Error in auto contrast");
}
@@ -781,8 +782,8 @@ Status Equalize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *o
cv::Mat result;
cv::merge(image_result, result);
std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result);
if (input_cv->Rank() == 2) output_cv->Squeeze();
(*output) = std::static_pointer_cast<Tensor>(output_cv);
(*output)->Reshape(input->shape());
} catch (const cv::Exception &e) {
RETURN_STATUS_UNEXPECTED("Error in equalize.");
}


+ 18
- 13
mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.cc View File

@@ -27,17 +27,34 @@ namespace dataset {
SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab,
const SPieceTokenizerLoadType load_type,
const SPieceTokenizerOutType out_type)
: vocab_(vocab), load_type_(load_type), out_type_(out_type) {}
: vocab_(vocab), load_type_(load_type), out_type_(out_type) {
auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto());
if (!status.ok()) {
model_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "parser vocab model filed.");
} else {
model_status_ = Status::OK();
}
}

SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename,
const SPieceTokenizerLoadType load_type,
const SPieceTokenizerOutType out_type)
: load_type_(load_type), out_type_(out_type) {
(void)GetModelRealPath(model_path, model_filename);
auto status = processor_.Load(file_path_);
if (!status.ok()) {
model_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "load vocab model filed.");
} else {
model_status_ = Status::OK();
}
}

Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (!model_status_.IsOk()) {
return model_status_;
}

if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
}
@@ -45,18 +62,6 @@ Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
std::string_view sentence_v;
RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
std::string sentence{sentence_v};
if (load_type_ == SPieceTokenizerLoadType::kFile) {
auto status = processor_.Load(file_path_);
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("load sentence piece model failed.");
}
} else {
RETURN_UNEXPECTED_IF_NULL(vocab_);
auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto());
if (!status.ok()) {
RETURN_STATUS_UNEXPECTED("sentence piece load model failed.");
}
}

if (out_type_ == SPieceTokenizerOutType::kString) {
std::vector<std::string> pieces;


+ 1
- 0
mindspore/ccsrc/minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h View File

@@ -58,6 +58,7 @@ class SentencePieceTokenizerOp : public TensorOp {
std::string file_path_;
SPieceTokenizerLoadType load_type_;
sentencepiece::SentencePieceProcessor processor_;
Status model_status_;
};
} // namespace dataset
} // namespace mindspore


+ 7
- 2
mindspore/ccsrc/minddata/dataset/util/task_manager.cc View File

@@ -296,7 +296,13 @@ Status TaskGroup::CreateAsyncTask(const std::string &my_name, const std::functio
return Status::OK();
}

void TaskGroup::interrupt_all() noexcept { intrp_svc_->InterruptAll(); }
void TaskGroup::interrupt_all() noexcept {
// There is a racing condition if we don't stop the interrupt service at this point. New resource
// may come in and not being picked up after we call InterruptAll(). So stop new comers and then
// interrupt any existing resources.
(void)intrp_svc_->ServiceStop();
intrp_svc_->InterruptAll();
}

Status TaskGroup::join_all(Task::WaitFlag wf) {
Status rc;
@@ -312,7 +318,6 @@ Status TaskGroup::join_all(Task::WaitFlag wf) {
}

Status TaskGroup::DoServiceStop() {
intrp_svc_->ServiceStop();
interrupt_all();
return (join_all(Task::WaitFlag::kNonBlocking));
}
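The comment added to interrupt_all() describes the race being closed: the interrupt service is stopped first, so nothing new can register after InterruptAll() has run, and DoServiceStop now relies on interrupt_all() for both steps. A tiny sketch of that stop-then-interrupt ordering with an explicit accepting flag (names are illustrative):

#include <functional>
#include <mutex>
#include <vector>

class ToyInterruptService {
 public:
  // Returns false once the service has been stopped, so late comers are rejected
  // instead of being registered after InterruptAll() has already run.
  bool Register(std::function<void()> interrupt_fn) {
    std::lock_guard<std::mutex> lock(mutex_);
    if (!accepting_) return false;
    handlers_.push_back(std::move(interrupt_fn));
    return true;
  }

  void ServiceStop() {
    std::lock_guard<std::mutex> lock(mutex_);
    accepting_ = false;
  }

  void InterruptAll() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto &fn : handlers_) fn();
  }

 private:
  std::mutex mutex_;
  bool accepting_ = true;
  std::vector<std::function<void()>> handlers_;
};

// interrupt_all() then becomes: svc.ServiceStop(); svc.InterruptAll();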


+ 1
- 0
mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc View File

@@ -133,6 +133,7 @@ void BindGlobalParams(py::module *m) {
(*m).attr("MAX_PAGE_SIZE") = kMaxPageSize;
(*m).attr("MIN_SHARD_COUNT") = kMinShardCount;
(*m).attr("MAX_SHARD_COUNT") = kMaxShardCount;
(*m).attr("MAX_FILE_COUNT") = kMaxFileCount;
(*m).attr("MIN_CONSUMER_COUNT") = kMinConsumerCount;
(void)(*m).def("get_max_thread_num", &GetMaxThreadNum);
}


+ 2
- 1
mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h View File

@@ -104,7 +104,8 @@ const uint64_t kInt64Len = 8;
const uint64_t kMinFileSize = kInt64Len;

const int kMinShardCount = 1;
const int kMaxShardCount = 1000;
const int kMaxShardCount = 1000; // write
const int kMaxFileCount = 4096; // read

const int kMinConsumerCount = 1;
const int kMaxConsumerCount = 128;


+ 1
- 1
mindspore/ccsrc/minddata/mindrecord/include/shard_header.h View File

@@ -152,7 +152,7 @@ class ShardHeader {

MSRStatus CheckIndexField(const std::string &field, const json &schema);

void ParsePage(const json &page, int shard_index, bool load_dataset);
MSRStatus ParsePage(const json &page, int shard_index, bool load_dataset);

MSRStatus ParseStatistics(const json &statistics);



+ 2
- 2
mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc View File

@@ -252,7 +252,7 @@ std::vector<std::tuple<int, int, int, uint64_t>> ShardReader::ReadRowGroupSummar
if (shard_count <= 0) {
return row_group_summary;
}
if (shard_count <= kMaxShardCount) {
if (shard_count <= kMaxFileCount) {
for (int shard_id = 0; shard_id < shard_count; ++shard_id) {
// return -1 when page's size equals to 0.
auto last_page_id = shard_header_->GetLastPageId(shard_id);
@@ -1054,7 +1054,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector<std::tuple<int, int, i
}
auto offsets = std::get<1>(ret);
auto local_columns = std::get<2>(ret);
if (shard_count_ <= kMaxShardCount) {
if (shard_count_ <= kMaxFileCount) {
for (int shard_id = 0; shard_id < shard_count_; shard_id++) {
for (uint32_t i = 0; i < offsets[shard_id].size(); i += 1) {
tasks_.InsertTask(TaskType::kCommonTask, offsets[shard_id][i][0], offsets[shard_id][i][1],


+ 1
- 1
mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc View File

@@ -83,7 +83,7 @@ MSRStatus ShardWriter::OpenDataFiles(bool append) {
// if not append and mindrecord file exist, return FAILED
fs->open(common::SafeCStr(file), std::ios::in | std::ios::binary);
if (fs->good()) {
MS_LOG(ERROR) << "MindRecord file already existed.";
MS_LOG(ERROR) << "MindRecord file already existed, please delete file: " << common::SafeCStr(file);
fs->close();
return FAILED;
}


+ 14
- 4
mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc View File

@@ -55,7 +55,9 @@ MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers, bool l
header_size_ = header["header_size"].get<uint64_t>();
page_size_ = header["page_size"].get<uint64_t>();
}
ParsePage(header["page"], shard_index, load_dataset);
if (SUCCESS != ParsePage(header["page"], shard_index, load_dataset)) {
return FAILED;
}
shard_index++;
}
return SUCCESS;
@@ -248,11 +250,16 @@ MSRStatus ShardHeader::ParseIndexFields(const json &index_fields) {
return SUCCESS;
}

void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) {
MSRStatus ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) {
// set shard_index when load_dataset is false
if (pages_.empty() && shard_count_ <= kMaxShardCount) {
if (shard_count_ > kMaxFileCount) {
MS_LOG(ERROR) << "The number of mindrecord files is greater than max value: " << kMaxFileCount;
return FAILED;
}
if (pages_.empty() && shard_count_ <= kMaxFileCount) {
pages_.resize(shard_count_);
}

for (auto &page : pages) {
int page_id = page["page_id"];
int shard_id = page["shard_id"];
@@ -275,6 +282,7 @@ void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_datase
pages_[shard_index].push_back(std::move(parsed_page));
}
}
return SUCCESS;
}

MSRStatus ShardHeader::ParseStatistics(const json &statistics) {
@@ -715,7 +723,9 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) {

std::string line;
while (std::getline(page_in_handle, line)) {
ParsePage(json::parse(line), -1, true);
if (SUCCESS != ParsePage(json::parse(line), -1, true)) {
return FAILED;
}
}

page_in_handle.close();


+ 11
- 5
mindspore/ccsrc/pipeline/jit/parse/parse.cc View File

@@ -17,6 +17,8 @@
*/

#include "pipeline/jit/parse/parse.h"

#include <utility>
#include <string>
#include <memory>
#include <sstream>
@@ -1480,21 +1482,25 @@ AnfNodePtr FindPhis(const std::unordered_map<ParameterPtr, AnfNodePtr> &removabl
void Parser::RemoveUnnecessaryPhis() {
// merge all removable phis to one map;
std::unordered_map<ParameterPtr, AnfNodePtr> removable_phis;
std::vector<ParameterPtr> phis;
for (FunctionBlockPtr &block : func_block_list_) {
MS_EXCEPTION_IF_NULL(block);
removable_phis.insert(block->removable_phis().begin(), block->removable_phis().end());
std::transform(block->removable_phis().begin(), block->removable_phis().end(), std::back_inserter(phis),
[](std::pair<ParameterPtr, AnfNodePtr> pair) { return pair.first; });
}
if (removable_phis.size() == 0) {
return;
}

auto fg_name = func_graph_->ToString();
auto mng = Manage(func_graph_, false);
// replace the nodes
for (auto iter : removable_phis) {
auto new_node = FindPhis(removable_phis, iter.first);
MS_LOG(DEBUG) << "phi " << iter.first->DebugString() << " to " << new_node->DebugString();
mng->Replace(iter.first, new_node);
// remove from inside to outside
for (int idx = SizeToInt(phis.size() - 1); idx >= 0; idx--) {
auto phi = phis[IntToSize(idx)];
auto new_node = FindPhis(removable_phis, phi);
MS_LOG(DEBUG) << "phi " << phi->DebugString() << " to " << new_node->DebugString();
mng->Replace(phi, new_node);
}
// remove the parameter
for (FunctionBlockPtr &block : func_block_list_) {


+ 9
- 1
mindspore/ccsrc/pipeline/jit/pipeline.cc View File

@@ -45,6 +45,7 @@
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#include "frontend/parallel/ps/worker.h"
#endif

#if (ENABLE_GE || ENABLE_D)
@@ -261,6 +262,7 @@ void ExecutorPy::DelNetRes(const std::string &id) {
for (auto &item : tmp_info) {
if (item.first.find(id) != string::npos) {
MS_LOG(DEBUG) << "Delete network res:" << item.first;
item.second = nullptr;
(void)info_.erase(item.first);
flag = true;
}
@@ -949,7 +951,13 @@ void ClearResAtexit() {
pynative::ClearPyNativeSession();
session::ClearPythonParasMap();
device::KernelRuntimeManager::Instance().ClearRuntimeResource();

#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if (mindspore::parallel::ps::Util::IsParamServerMode()) {
if (parallel::ps::Util::IsRoleOfWorker()) {
parallel::ps::Worker<float>::GetInstance().Finalize();
}
}
#endif
ad::g_k_prims.clear();

abstract::ClearPrimEvaluatorMap();


+ 12
- 1
mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc View File

@@ -150,7 +150,8 @@ PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() {
using mindspore::parse::PyObjectWrapper;

EvalResultPtr StandardPrimEvaluator::EvalPrim(const AnalysisEnginePtr &engine, const AbstractBasePtrList &args) {
if (prim_ != prim::kPrimMakeTuple && prim_ != prim::kPrimSwitch) {
if (prim_ != prim::kPrimMakeTuple && prim_ != prim::kPrimSwitch && prim_ != prim::kPrimEnvSetItem &&
prim_ != prim::kPrimEnvGetItem) {
auto ret_abstract = AbstractEval(args);
if (ret_abstract != nullptr) {
MS_LOG(DEBUG) << "StandardPrimEvaluator eval Undetermined";
@@ -386,6 +387,16 @@ py::dict ConvertAbstractToPython(const AbstractBasePtr &abs_base) {
dic["shape"] = arg_tensor->shape()->shape();
dic["dtype"] = arg_tensor->BuildType();
dic["value"] = BuildValue(arg_tensor->BuildValue());
} else if (abs_base->isa<AbstractIndexedSlices>()) {
auto arg = dyn_cast<AbstractIndexedSlices>(abs_base);
dic["shape"] = arg->shape()->shape();
dic["dtype"] = arg->BuildType();
dic["value"] = BuildValue(arg->BuildValue());
} else if (abs_base->isa<AbstractSparseTensor>()) {
auto arg = dyn_cast<AbstractSparseTensor>(abs_base);
dic["shape"] = arg->shape()->shape();
dic["dtype"] = arg->BuildType();
dic["value"] = BuildValue(arg->BuildValue());
} else if (abs_base->isa<AbstractScalar>() || abs_base->isa<AbstractType>() || abs_base->isa<AbstractRefKey>()) {
std::vector<int> shape;
dic["shape"] = shape;


+ 1
- 1
mindspore/ccsrc/pipeline/pynative/base.h View File

@@ -59,7 +59,7 @@ struct OpExecInfo {
using OpExecInfoPtr = std::shared_ptr<OpExecInfo>;
OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args);

const std::set<std::string> ignore_infer_prim = {"make_ref"};
const std::set<std::string> ignore_infer_prim = {"make_ref", "mixed_precision_cast"};
} // namespace pynative
} // namespace mindspore



+ 18
- 9
mindspore/ccsrc/pipeline/pynative/pynative_execute.cc View File

@@ -57,7 +57,7 @@ using mindspore::tensor::TensorPy;

const char SINGLE_OP_GRAPH[] = "single_op_graph";
// primitive unable to infer value for constant input in PyNative mode
const std::set<std::string> vm_operators = {"make_ref", "HookBackward", "stop_gradient"};
const std::set<std::string> vm_operators = {"make_ref", "HookBackward", "stop_gradient", "mixed_precision_cast"};

namespace mindspore {
namespace pynative {
@@ -690,12 +690,15 @@ py::tuple RunOpInner(const OpExecInfoPtr &op_exec_info, const py::args &args) {
return err_ret;
}

auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result);
if (node != nullptr) {
node->set_abstract(op_exec_info->abstract);
MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString();
if (op_exec_info->op_name != prim::kPrimMixedPrecisionCast->name()) {
auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result);
if (node != nullptr) {
node->set_abstract(op_exec_info->abstract);
MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString();
}
MS_LOG(DEBUG) << "RunOp end";
}
MS_LOG(DEBUG) << "RunOp end";
return result;
}

@@ -766,6 +769,9 @@ PynativeExecutor::PynativeExecutor() { grad_flag_ = false; }
void PynativeExecutor::NewGraphInner(const py::object &cell, const py::args &args) {
auto cell_id = GetId(cell);
if (cell_graph_map_.count(cell_id) != 0) {
if (cell_resource_map_.find(cell_id) != cell_resource_map_.end()) {
resource_ = cell_resource_map_[cell_id];
}
MS_LOG(DEBUG) << "Newgraph already compiled";
return;
}
@@ -774,6 +780,8 @@ void PynativeExecutor::NewGraphInner(const py::object &cell, const py::args &arg

if (top_g_ == nullptr) {
top_g_ = curr_g_ = g;
resource_ = std::make_shared<pipeline::Resource>();
cell_resource_map_[cell_id] = resource_;
df_builder_ = std::make_shared<FuncGraph>();
MS_LOG(DEBUG) << "First new graph" << top_g_.get();
Pushp();
@@ -910,8 +918,8 @@ void PynativeExecutor::EndGraphInner(const py::object &cell, const py::object &o
cnode->set_inputs(args);
set_obj_node_map(curr_g_, out_id, cnode);
} else {
MS_LOG(ERROR) << "Graph has no this out: " << out_id;
return;
MS_LOG(DEBUG) << "Set ValueNode as output for graph, out id: " << out_id;
MakeValueNode(out, out_id);
}
}
EndGraphByOutId(out_id, cell, out, args);
@@ -1075,6 +1083,7 @@ void PynativeExecutor::Clear(const std::string &flag) {
MS_LOG(INFO) << "Clear res";
(void)graph_map_.erase(flag);
(void)cell_graph_map_.erase(flag);
(void)cell_resource_map_.erase(flag);
Clean();
// Maybe exit in the pynative runing op, so need reset pynative flag.
auto ms_context = MsContext::GetInstance();
@@ -1086,6 +1095,7 @@ void PynativeExecutor::Clear(const std::string &flag) {

MS_LOG(INFO) << "Clear";
top_g_ = nullptr;
df_builder_ = nullptr;
curr_g_ = nullptr;
graph_info_map_.clear();
std::stack<FuncGraphPtr>().swap(graph_p_);
@@ -1095,7 +1105,6 @@ void PynativeExecutor::Clean() {
MS_LOG(INFO) << "Clean all res";
Clear();
grad_flag_ = false;
df_builder_ = nullptr;
ad::CleanRes();
pipeline::ReclaimOptimizer();
}


+ 1
- 0
mindspore/ccsrc/pipeline/pynative/pynative_execute.h View File

@@ -115,6 +115,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
bool grad_flag_;
std::unordered_map<std::string, FuncGraphPtr> graph_map_;
std::unordered_map<std::string, FuncGraphPtr> cell_graph_map_;
std::unordered_map<std::string, ResourcePtr> cell_resource_map_;
std::unordered_map<FuncGraphPtr, GraphInfo> graph_info_map_;
std::stack<FuncGraphPtr> graph_p_;
FuncGraphPtr top_g_;


+ 2
- 1
mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc View File

@@ -484,7 +484,8 @@ bool AscendDeviceAddress::SyncDeviceToHostAndConvertFormat(const std::vector<int
std::vector<size_t> device_shape = GetDeviceShape(&host_shape);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->execution_mode() == kPynativeMode && type_id_name_map.find(type_id_) != type_id_name_map.end()) {
if (ms_context->execution_mode() != kPynativeMode && ms_context->execution_mode() != kGraphMode &&
type_id_name_map.find(type_id_) != type_id_name_map.end()) {
std::pair<std::string, std::string> type_format = std::make_pair(type_id_name_map.at(type_id_), format_);
if (use_trans_data.find(type_format) != use_trans_data.end()) {
sync_ok = SyncDeviceToHostAndConvertFormatBasedOnTransData(host_shape, device_shape, size, type, host_ptr);


+ 10
- 6
mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.cc View File

@@ -672,10 +672,8 @@ void AscendStreamAssign::InsertEventForIndependentParallel(const NotNull<KernelG
void AscendStreamAssign::GetNeedActiveStreams(const NotNull<KernelGraphPtr> &graph_ptr) {
CNodePtr cur_cnode_ptr = nullptr;
auto cnode_ptr_list = graph_ptr->execution_order();
// 1)first stream 0 should be actived first;
need_first_active_streams_.emplace_back(0);

// 2)stream witch kStreamNeedActivedFirst attr should be actived;
// 1)stream witch kStreamNeedActivedFirst attr should be actived;
for (size_t i = 0; i < cnode_ptr_list.size(); ++i) {
cur_cnode_ptr = cnode_ptr_list[i];
MS_EXCEPTION_IF_NULL(cur_cnode_ptr);
@@ -691,19 +689,25 @@ void AscendStreamAssign::GetNeedActiveStreams(const NotNull<KernelGraphPtr> &gra
}
}

// 3)independent stream:if has not been activate, push to need active vector
// 2)independent stream:if has not been activate, push to need active vector
if (!independent_stream_activated_) {
for (auto &item : independent_stream_map_) {
need_first_active_streams_.emplace_back(item.first);
}
}

// 4)hcom stream:if has not been activate, push to need active vector
// 3)hcom stream:if has not been activate, push to need active vector
if (!hcom_stream_activated_) {
for (auto &item : hcom_stream_map_) {
need_first_active_streams_.emplace_back(item.first);
}
}

// 4)first stream 0 should be actived first;
auto it = std::find(need_first_active_streams_.begin(), need_first_active_streams_.end(), 0);
if (it == need_first_active_streams_.end()) {
need_first_active_streams_.emplace_back(0);
}
}

// section8
@@ -958,7 +962,7 @@ void AscendStreamAssign::DFS(uint32_t start, std::vector<uint32_t> *group) {
if (!IsVecExist(group)) {
stream_groups_.emplace_back(*group);
} else {
MS_LOG(WARNING) << "DFS should not print this log";
MS_LOG(WARNING) << "DFS find same stream group, Not expected";
}
return;
}
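GetNeedActiveStreams now appends stream 0 last instead of first, and only when none of the earlier rules (the kStreamNeedActivedFirst attribute, independent streams, hcom streams) already put it in the list. The duplicate-avoiding append on its own, under the assumption the list is a plain std::vector<uint32_t>:

#include <algorithm>
#include <cstdint>
#include <vector>

// Append stream 0 last, and only if no earlier rule already added it.
void AppendDefaultStream(std::vector<uint32_t> *need_first_active_streams) {
  auto it = std::find(need_first_active_streams->begin(), need_first_active_streams->end(), 0u);
  if (it == need_first_active_streams->end()) {
    need_first_active_streams->push_back(0);
  }
}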


+ 4
- 0
mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc View File

@@ -492,6 +492,10 @@ void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, co
AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) {
continue;
}
if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown &&
AnfAlgo::OutputAddrExist(real_input_node, 0)) {
continue;
}
if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) {
std::vector<std::string> output_format = {selected_kernel_info.GetInputFormat(input_index)};
builder->SetOutputsFormat(output_format);


+ 5
- 0
mindspore/ccsrc/runtime/device/cpu/cpu_device_address.cc View File

@@ -52,6 +52,11 @@ bool CPUDeviceAddress::SyncDeviceToHost(const std::vector<int> & /*shape*/, size

bool CPUDeviceAddress::SyncHostToDevice(const std::vector<int> & /*shape*/, size_t size, TypeId type,
const void *host_ptr) const {
if (host_ptr == ptr_) {
MS_LOG(DEBUG) << "host_ptr is equal to ptr_, request ignored.";
return true;
}

if (type == kNumberTypeFloat16) {
HalfToFloat(ptr_, host_ptr, size / 2);
} else if (type == kNumberTypeFloat64) {


+ 8
- 4
mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc View File

@@ -40,8 +40,7 @@ void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) {
AssignValueNodeAddress(kernel_graph);
AssignInputNodeAddress(kernel_graph);
AssignKernelOutputAddress(kernel_graph);
resource_manager_.MemPlan(kernel_graph);
resource_manager_.MemMalloc(kernel_graph);
resource_manager_.AssignMemory(kernel_graph);
}

void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) {
@@ -186,11 +185,15 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &k
return ret;
}
return CreatTensorForOutput(node, index, bound_addresses, need_sync_outputs);
} else if (input_node->isa<Parameter>() || input_node->isa<ValueNode>()) {
} else if (input_node->isa<Parameter>()) {
auto iter = input_map.find(input_node.get());
if (iter != input_map.end()) {
return iter->second;
}
} else if (input_node->isa<ValueNode>()) {
auto value_node = input_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
return value_node->value();
}
return BaseRef();
}
@@ -220,7 +223,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
(void)tensor->data_sync();
}

if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) {
if (tensor->data_type() == address->type_id_ || tensor->data_type() == kNumberTypeFloat32 ||
tensor->data_type() == kNumberTypeInt32) {
address->ptr_ = tensor->data_c();
} else {
std::vector<int> data_shape = tensor->shape();


+ 6
- 7
mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.cc View File

@@ -34,11 +34,13 @@ void CPUResourceManager::MemFree() {
dynamic_mem_.clear();
}

void CPUResourceManager::MemPlan(const session::KernelGraph *graph) {
mem_plan_.MemPlan(graph);
size_t graph_mem_size = mem_plan_.GetGraphMemSize(graph);
void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) {
size_t graph_mem_size = mem_plan_.MemPlan(graph);
if (graph_mem_size > mem_size_) {
MemFree();
if (mem_size_ > 0) {
dynamic_mem_[mem_ptr_] = mem_size_;
mem_size_ = 0;
}
mem_ptr_ = reinterpret_cast<uint8_t *>(malloc(graph_mem_size));
if (mem_ptr_ != nullptr) {
mem_size_ = graph_mem_size;
@@ -48,9 +50,6 @@ void CPUResourceManager::MemPlan(const session::KernelGraph *graph) {
dynamic_malloc_ = true;
}
}
}

void CPUResourceManager::MemMalloc(const session::KernelGraph *graph) {
if (dynamic_malloc_) {
return;
}
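On the CPU runtime side, MemPlan and MemMalloc are folded into a single AssignMemory: the planner now returns the required size, a new block is malloc'ed only when the plan outgrows the current one, and the outgrown block is parked in dynamic_mem_ instead of being freed on the spot. A rough sketch of that grow-only behaviour with illustrative names; the fallback to per-kernel dynamic allocation on malloc failure is reduced to returning nullptr:

#include <cstddef>
#include <cstdlib>
#include <map>

// Keeps one "current" block sized to the largest plan seen so far.
// Outgrown blocks are remembered in retired_ so they can be freed later.
class GrowOnlyArena {
 public:
  ~GrowOnlyArena() {
    for (auto &kv : retired_) std::free(kv.first);
    std::free(current_);
  }

  // Returns the base pointer for a graph that needs planned_size bytes,
  // or nullptr if allocation failed (the caller would fall back to
  // dynamic per-kernel allocation, as the CPU runtime does).
  void *Assign(std::size_t planned_size) {
    if (planned_size <= current_size_) return current_;
    if (current_ != nullptr) retired_[current_] = current_size_;
    current_ = std::malloc(planned_size);
    current_size_ = (current_ != nullptr) ? planned_size : 0;
    return current_;
  }

 private:
  void *current_ = nullptr;
  std::size_t current_size_ = 0;
  std::map<void *, std::size_t> retired_;
};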


+ 3
- 4
mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.h View File

@@ -17,7 +17,7 @@
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_

#include <vector>
#include <unordered_map>
#include <map>
#include "backend/session/kernel_graph.h"
#include "backend/session/session_basic.h"
#include "runtime/device/device_address.h"
@@ -30,8 +30,7 @@ class CPUResourceManager {
CPUResourceManager() = default;
~CPUResourceManager();

void MemPlan(const session::KernelGraph *graph);
void MemMalloc(const session::KernelGraph *graph);
void AssignMemory(const session::KernelGraph *graph);
void IncreaseAddressRefCount(const session::KernelGraph *graph);
void DecreaseAddressRefCount(const AnfNodePtr &kernel);
void *MemMalloc(size_t mem_size);
@@ -46,7 +45,7 @@ class CPUResourceManager {
size_t mem_size_{0};
uint8_t *mem_ptr_{nullptr};
bool dynamic_malloc_{false};
std::unordered_map<void *, size_t> dynamic_mem_;
std::map<void *, size_t> dynamic_mem_;
};
} // namespace cpu
} // namespace device


+ 3
- 10
mindspore/ccsrc/runtime/device/cpu/cpu_simple_mem_plan.cc View File

@@ -19,9 +19,9 @@
namespace mindspore {
namespace device {
namespace cpu {
void CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) {
size_t CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
size_t total_mem_size = 0;
size_t total_mem_size = 32;
auto kernels = graph->execution_order();
for (const auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
@@ -58,15 +58,8 @@ void CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) {
}
}
}
graph_mem_size_[graph] = total_mem_size;
}

size_t CPUSimpleMemPlan::GetGraphMemSize(const session::KernelGraph *graph) const {
auto iter = graph_mem_size_.find(graph);
if (iter != graph_mem_size_.end()) {
return iter->second;
}
return 0;
return total_mem_size;
}

void CPUSimpleMemPlan::MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr) {


+ 1
- 6
mindspore/ccsrc/runtime/device/cpu/cpu_simple_mem_plan.h View File

@@ -17,7 +17,6 @@
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_SIMPLE_MEM_PLAN_H_

#include <vector>
#include <unordered_map>
#include "backend/session/kernel_graph.h"
#include "runtime/device/device_address.h"

@@ -29,12 +28,8 @@ class CPUSimpleMemPlan {
CPUSimpleMemPlan() = default;
~CPUSimpleMemPlan() = default;

void MemPlan(const session::KernelGraph *graph);
size_t MemPlan(const session::KernelGraph *graph);
void MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr);
size_t GetGraphMemSize(const session::KernelGraph *graph) const;

private:
std::unordered_map<const session::KernelGraph *, size_t> graph_mem_size_;
};
} // namespace cpu
} // namespace device


+ 0
- 1
mindspore/ccsrc/runtime/device/kernel_runtime.cc View File

@@ -355,7 +355,6 @@ void KernelRuntime::AssignStaticMemoryOutput(session::KernelGraph *graph) {
if (!item_with_index.first->isa<CNode>() || !AnfAlgo::IsRealKernel(item_with_index.first)) {
continue;
}
graph->AddFinalOutputKernel(item_with_index.first);
if (AnfAlgo::IsCommunicationOp(item_with_index.first)) {
AssignCommunicationNodeMem(kStaticMem, item_with_index.first);
} else {


+ 1
- 11
mindspore/ccsrc/transform/graph_ir/op_declare.cc View File

@@ -309,12 +309,7 @@ INPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{1, INPUT_DESC(features)}, {2, INPUT
ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP;
OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}};

// MeanGrad
INPUT_MAP(MeanGrad) = {{1, INPUT_DESC(x)}};
INPUT_ATTR_MAP(MeanGrad) = {{2, ATTR_DESC(mean_grad_output_shape_value, kOpFormat_NHWC,
AnyTraits<std::vector<int64_t>>(), AnyTraits<int64_t>())}};
ATTR_MAP(MeanGrad) = {{"mode", ATTR_DESC(mode, AnyTraits<int64_t>())}};

// SliceD
INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}};
INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())},
{3, ATTR_DESC(size, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}};
@@ -431,11 +426,6 @@ INPUT_MAP(TopK) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(k)}};
ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits<bool>())}};
OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}};

// Multiply
INPUT_MAP(Multiply) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}};
ATTR_MAP(Multiply) = EMPTY_ATTR_MAP;
OUTPUT_MAP(Multiply) = {{0, OUTPUT_DESC(z)}};

// TileD
INPUT_MAP(TileD) = {{1, INPUT_DESC(x)}};
INPUT_ATTR_MAP(TileD) = {{2, ATTR_DESC(multiples, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}};


Some files were not shown because too many files changed in this diff
