| Author | SHA1 | Message | Date |
|---|---|---|---|
|
|
a2edfcb09b |
!8171 【轻量级 PR】:update RELEASE.md.
Merge pull request !8171 from shenwei41/N/A |
5 years ago |
|
|
bbff1828ba |
!8185 fix securec download links due to mistakes made by openeuler community
Merge pull request !8185 from yanghaoran/r0.6 |
5 years ago |
|
|
94c644cb52 | update graphengine, fix securec download links | 5 years ago |
|
|
df90cf1538 | update RELEASE.md. | 5 years ago |
|
|
4ca658319b |
!6531 【MD】r0.6 Branch: MD5 value update in the file - icu4c.cmake of branch r0.6
Merge pull request !6531 from magemomou/MD5_r0.6 |
5 years ago |
|
|
aa89c9f33c | MD5 value update in the file icu4c.cmake of branch r0.6 | 5 years ago |
|
|
d1b1a626c2 |
!5447 Support manual convert to quantative network of resnet
Merge pull request !5447 from chenfei_mindspore/r0.6 |
5 years ago |
|
|
d27f7bf88b | add manual quantative network of resnet | 5 years ago |
|
|
50d7480a4e |
!4457 modify yolov3_quant eval script
Merge pull request !4457 from chengxb7532/r0.6 |
5 years ago |
|
|
ef9e3a5360 | modify yolov3_darknet53 | 5 years ago |
|
|
04a6612baf |
!4424 modify quant DenseBnAct API
Merge pull request !4424 from chengxb7532/r0.6 |
5 years ago |
|
|
7bc5b71b44 | modify quant DenseBnAct code | 5 years ago |
|
|
30452899ec |
!4351 modify yolov3-darknet quant net codes
Merge pull request !4351 from chengxb7532/r0.6 |
5 years ago |
|
|
59863abcd3 | modify yolov3-darknet53 quant code | 5 years ago |
|
|
a15ae5238d |
!4304 upload yolov3-darknet quant net codes
Merge pull request !4304 from chengxb7532/r0.6 |
5 years ago |
|
|
c80a1da8ac | upload yolov3-darknet53 quant code | 5 years ago |
|
|
7d483cd09c |
!4115 runpackage sync C75B050 for mindspore r0.6
Merge pull request !4115 from HW_KK/r0.6 |
5 years ago |
|
|
9f3dcd7ab9 | runpackage sync C75B050 for r0.6 | 5 years ago |
|
|
801660ef08 |
!3912 fix numpyslice bug
Merge pull request !3912 from luoyang/son_r0.6 |
5 years ago |
|
|
16f54c900b | fix numpyslice bug | 5 years ago |
|
|
5465525f09 |
!3812 upgrade dockerfile version to 0.6.0-beta
Merge pull request !3812 from yanghaoran/r0.6 |
5 years ago |
|
|
d9320b1606 |
!3805 modify release note for 0.6
Merge pull request !3805 from changzherui/mod_release |
5 years ago |
|
|
34f2e94bd4 | update mindspore version to 0.6.0-beta | 5 years ago |
|
|
a0e575a17d | modify release | 5 years ago |
|
|
dc4e15d32c | update RELEASE.md. | 5 years ago |
|
|
917a7e227f | update RELEASE.md. | 5 years ago |
|
|
1b7daf777a | update build.sh. | 5 years ago |
|
|
7d6160516f |
!3761 simplify googlenet
Merge pull request !3761 from panfengfeng/simplify_googlenet |
5 years ago |
|
|
ca881ec03e | add maxpool_with_argmax/grad cuda kernel | 5 years ago |
|
|
983437feaf |
!3757 debug mindspore hub
Merge pull request !3757 from chenzhongming/r0.6 |
5 years ago |
|
|
a059e8910f | debug mindspore hub | 5 years ago |
|
|
9dc23eeb98 |
!3602 Delete hard code in pull node
Merge pull request !3602 from ZPaC/r0.6-delete-hard-code-in-pull-node |
5 years ago |
|
|
78e3cb4bc4 | Delete hard code in pull kernel. | 5 years ago |
|
|
0db3ff5773 |
!3742 fix GetInputReshapeType reports ERROR
Merge pull request !3742 from liubuyu/r0.6 |
5 years ago |
|
|
c9583ad3a4 |
!3730 fix bug of cast dtype when using mix_presion in pynative mode
Merge pull request !3730 from jinyaohui/mix_presion |
5 years ago |
|
|
294520e1fd |
!3548 Pass optimzier attributes to push kernel and parameter server.
Merge pull request !3548 from ZPaC/r0.6-pass-attr-to-ps |
5 years ago |
|
|
4621565258 |
!3733 block trans data to change format
Merge pull request !3733 from lvchangquan/r.06 |
5 years ago |
|
|
b3b71e1d3f |
!3724 modify readme and timemoniter steps
Merge pull request !3724 from wanghua/r0.6 |
5 years ago |
|
|
7d5e523743 | fix set/get reshape type bug | 5 years ago |
|
|
0fb669190a |
!3703 Enlarge the threshold of resnet50 performance st in pynative
Merge pull request !3703 from JoyLvliang/r0.6 |
5 years ago |
|
|
db216a077a | fix bug of cast dtype when using mix_presion in pynative mode | 5 years ago |
|
|
f298e55072 | block use trans data to change format | 5 years ago |
|
|
dcd471eb96 |
!3718 add mindspore hub for download ckpt file
Merge pull request !3718 from chenzhongming/r0.6 |
5 years ago |
|
|
c9a675f4e5 | modify readme and timemoniter steps | 5 years ago |
|
|
fdc183ad36 |
!3704 [r0.6][bug][auto_mixed_precision]fix amp bug in eval
Merge pull request !3704 from vlne-v1/amp_doc_r0.6 |
5 years ago |
|
|
937c5b5d8e | enlarge the threshold of resnet50 performance in pynative | 5 years ago |
|
|
783b823a25 |
add mindspore hub for download ckpt file
add mindspore.hub and change model_zoo |
5 years ago |
|
|
30ffcd8a1f |
!3681 modelzoo: support vgg16 in GPU
Merge pull request !3681 from ms_yan/vgg_r0.6 |
5 years ago |
|
|
9ab94fa076 |
!3685 add tinybert scripts
Merge pull request !3685 from wanghua/r0.6 |
5 years ago |
|
|
944929f980 |
!3682 add googlenet gpu
Merge pull request !3682 from panfengfeng/googlenet-gpu_support |
5 years ago |
|
|
09dd4128d5 |
!3689 fix cpu multi graph mem error
Merge pull request !3689 from kisnwang/r0.6-fix-cpu-multi-graph-memory-error |
5 years ago |
|
|
ca4b2f6c0b | fix eval in amp | 5 years ago |
|
|
7f3926429b |
!3628 fix log bug
Merge pull request !3628 from gukecai/log |
5 years ago |
|
|
7360a2fa07 | fix cpu multi graph mem error | 5 years ago |
|
|
10f0f0d5a5 |
!3673 fix serving input numbers
Merge pull request !3673 from hexia/fix_input_check_r0.6 |
5 years ago |
|
|
6b81f9f7f7 |
!3683 Modify patches and alerts
Merge pull request !3683 from shenwei41/r0.6 |
5 years ago |
|
|
5a36b19e80 |
!3666 Modify the order of init and open of TDT
Merge pull request !3666 from hanjun996/r0.6 |
5 years ago |
|
|
6944af09ee |
!3596 fix batchnorm issue under mix precision in pynative mode
Merge pull request !3596 from wangqiuliang/fix-batchnorm-r0.6 |
5 years ago |
|
|
e497117b74 |
init add vgg16 gpu version
merge the script optimize the script repair problem in vgg16 cifar10 version optimize the vgg script |
5 years ago |
|
|
78375e104a |
!3680 lowering value checking threshold to fix bug of pass eps
Merge pull request !3680 from wangnan39/lowering_value_checking_threshold_to_support_training_with_very_small_steps |
5 years ago |
|
|
abd346e84b |
!3649 modify setup.py version number for r0.6
Merge pull request !3649 from changzherui/mod_ver_num |
5 years ago |
|
|
9156775655 |
!3677 support multy node training in deeplabv3
Merge pull request !3677 from zhouyaqiang0/r0.6 |
5 years ago |
|
|
df7f0c8a7c |
!3659 modify readme for maskrcnn
Merge pull request !3659 from meixiaowei/r0.6 |
5 years ago |
|
|
9da1c96c4a | add tinybert scripts | 5 years ago |
|
|
7d5a67e9f0 | googlenet-gpu | 5 years ago |
|
|
fc92598881 | fix batchnorm issue in pynative auto mix precision | 5 years ago |
|
|
e3fe1d76ca |
!3558 Fix a racing condition in CacheMergeOp when the leaf hits an error and exit too early
Merge pull request !3558 from guozhijian/fix_dataset_none_hung |
5 years ago |
|
|
b429a8421f |
!3586 fix python api doc for mindspore .dataset
Merge pull request !3586 from guansongsong/gss/fix_python_api_for_r0.6 |
5 years ago |
|
|
bb4339e3ca |
!3584 Fix a DatasetCache sharing scenario
Merge pull request !3584 from guansongsong/gss/fix_cache |
5 years ago |
|
|
e49a2f83e7 | Modify patches and alerts | 5 years ago |
|
|
1ec63700c7 |
!3632 Fix resource not release bug
Merge pull request !3632 from Kang/r0.6 |
5 years ago |
|
|
fc5d419422 | Lowering value checking threshold to support fix the bug of pass add eps | 5 years ago |
|
|
d4b5cda934 |
!3604 Fix minor errors in probabilistic programming
Merge pull request !3604 from peixu_ren/r0.6 |
5 years ago |
|
|
d6a56cd6fd | Pass optimizer attributes to push nodes. | 5 years ago |
|
|
f04243b1f1 |
!3663 Fix multi worker
Merge pull request !3663 from ZPaC/r0.6-fix-sgd |
5 years ago |
|
|
6b57b4f0e1 |
!3652 add epoch_num description
Merge pull request !3652 from panfengfeng/add_epoch_num_description |
5 years ago |
|
|
b096a6cbe9 | support multy node training and remove code | 5 years ago |
|
|
c718774538 | modify tdt | 5 years ago |
|
|
68128f87a9 |
!3634 Spilt unsupported transdata
Merge pull request !3634 from lianliguang/r0.6 |
5 years ago |
|
|
22dbd1a233 |
!3646 [MD] fix minddataset core dump when file list size ia greater than 1000.
Merge pull request !3646 from liyong126/r0.6_fix_minrecord_bug |
5 years ago |
|
|
52776820d8 | fix_input_check | 5 years ago |
|
|
5b15f40598 | Fix a DatasetCache sharing scenario | 5 years ago |
|
|
bf74164df3 | fix sync sgd under multi-worker | 5 years ago |
|
|
e5b9776b86 | modify readme | 5 years ago |
|
|
8803c6258d | add epoch_num | 5 years ago |
|
|
614841aa39 | modify setup version number | 5 years ago |
|
|
983cb9b23d | Fix resource not release bug | 5 years ago |
|
|
68f27eb62b | fix python api doc for mindspore.dataset | 5 years ago |
|
|
924a34acb8 |
!3639 fix GeneratorDataset time out
Merge pull request !3639 from yanghaitao/yht_generator_timeout_r0.6 |
5 years ago |
|
|
db01f3eafe |
!3640 support bprop for const in pynative and develop stridedslice and isinstance
Merge pull request !3640 from zhangbuxue/support_bprop_for_const_in_pynative_and_develop_stridedslice_and_isinstance |
5 years ago |
|
|
66d8395fea | fix coredump when number of file list more than 1000. | 5 years ago |
|
|
e33b5e435e |
!3633 fix dataset & train gil lock of gpu process
Merge pull request !3633 from panfengfeng/fix_dataset_train_gil_of_gpu |
5 years ago |
|
|
477bf42fe5 |
!3641 Update submodule akg to r0.6 branch
Merge pull request !3641 from looop5/akg_r0.6 |
5 years ago |
|
|
edba641ddb | split unsupported transdata | 5 years ago |
|
|
338a225410 |
!3623 [r0.6][bug][auto_mixed_precision]fix amp doc and eval network build
Merge pull request !3623 from vlne-v1/amp_doc_r0.6 |
5 years ago |
|
|
13d8bedbf4 | update submodule akg to r0.6 branch | 5 years ago |
|
|
9a43468fee |
!3626 fix: device occupied tdt hung
Merge pull request !3626 from guozhijian/fix_device_occupied_tdt_hung_r0.6 |
5 years ago |
|
|
6beb8071d7 | support bprop for const in pynative and develop stridedslice and isinstance. | 5 years ago |
|
|
cc233f66ab |
!3629 Fix numpyslice issue
Merge pull request !3629 from xiefangqi/md_fix_numpyslice_r0.6 |
5 years ago |
|
|
248130e5d1 | fix generator time out | 5 years ago |
|
|
8f6eafdfcd |
!3589 fix the description of cache
Merge pull request !3589 from guansongsong/fix_cache_core_for_r0.6 |
5 years ago |
|
|
30ed5a25ce | fix numpyslice issue to r0.6 | 5 years ago |
|
|
4eea891730 | fix dataset train gil of gpu | 5 years ago |
|
|
fe29a2501f | fix log bug | 5 years ago |
|
|
0d375bbaa3 | fix: device occupied tdt hung | 5 years ago |
|
|
4f1e586ee3 |
!3579 fix maskrcnn dataset rescale bug
Merge pull request !3579 from meixiaowei/r0.6 |
5 years ago |
|
|
dd26d85caf | fix doc and eval network build in amp | 5 years ago |
|
|
49cdeb3f78 | Fix minor errors in probabilistic programming | 5 years ago |
|
|
d9ca3f2e88 |
!3566 dataset: api format problem in totype, totensor, slice
Merge pull request !3566 from ms_yan/r0.6_api_format |
5 years ago |
|
|
c5f8b6b0c7 |
!3599 merge fix sparse doc to r0.6
Merge pull request !3599 from riemann_penn/merge_fix_sparse_doc_to_r0.6 |
5 years ago |
|
|
3714a07d71 | fix sparse api doc | 5 years ago |
|
|
950367c102 |
!3595 add desc about sink_size
Merge pull request !3595 from jinyaohui/sink_size |
5 years ago |
|
|
40b859395d | add description about sink_size | 5 years ago |
|
|
d7caa7955b |
!3582 Fix minddata cache include flatbuffer head problem
Merge pull request !3582 from xiefangqi/r0.6 |
5 years ago |
|
|
552490326f |
!3572 [MD] fix save pydoc and log
Merge pull request !3572 from liyong126/r0.6_fix_save_pydoc_log |
5 years ago |
|
|
543b75f366 | fix the description of cache | 5 years ago |
|
|
3d87436bb0 |
!3580 fix allreduce fusion case in grad reducer
Merge pull request !3580 from gziyan/fix_allreduce_fusion |
5 years ago |
|
|
47efc83bcd | repair api format problem in totype, totensor, slice | 5 years ago |
|
|
0e4065f0ef | fix flatbuffer head to r0.6 | 5 years ago |
|
|
fdb21ecf74 | update | 5 years ago |
|
|
7df05b1da7 | fix rescale dataset bug | 5 years ago |
|
|
c617a07dff |
!3533 modify serving readme
Merge pull request !3533 from dinghao/r0.6 |
5 years ago |
|
|
f52859a2fc | fix save op pydoc and log | 5 years ago |
|
|
2a6884d97c |
!3564 [Auto parallel] Cost model for GPU
Merge pull request !3564 from Xiaoda/15-r0.6-add-new-gpu-costmodel |
5 years ago |
|
|
b54fc35cde | modify serving readme | 5 years ago |
|
|
ab676ba81a | add costmodel for gpu | 5 years ago |
|
|
f118869869 | Fix a merge_op timing hole | 5 years ago |
|
|
c31c1c808a |
!3530 Fix a bug for Parameter
Merge pull request !3530 from hewei/fix_parameter_bug_r0.6 |
5 years ago |
|
|
67600c1d8c |
!3539 Change at-most collected tensor summary from 50 to 20 when auto-calculated
Merge pull request !3539 from LiHongzhang/f50_t20_r |
5 years ago |
|
|
49e8727d37 |
!3518 fix python import r0.6
Merge pull request !3518 from hexia/fix_python_import_r0.6 |
5 years ago |
|
|
36c2bbdbcc |
!3501 fix sparse feature bug for auto parallel
Merge pull request !3501 from lirongzhen1/r0.6 |
5 years ago |
|
|
a536e922c2 |
!3524 add bert ci script to r0.6 branch
Merge pull request !3524 from yoonlee666/bertci |
5 years ago |
|
|
d86668d216 |
change at-most collected tensor from 50 to 20
When `collect_tensor_freq` is specified as `None`, the `collect_tensor_freq` would be auto calculated. The previous behavior is to collect at most 50 steps, now changing to 20 |
5 years ago |
|
|
bcba696a62 |
!3482 `max_file_size` includes metadata and drops the last step
Merge pull request !3482 from LiHongzhang/limit_summary_r |
5 years ago |
|
|
1f6771256d |
Fix a bug for Parameter
1. Parameter's init_data() should have no effect if default_input already set; 2. This bug is introduced by 'decouple ParamValue from python'; 3. An unit test case is added to ensure the right init_data() behavior. |
5 years ago |
|
|
1dcf9abf6a | add bert ci script | 5 years ago |
|
|
5fb1280e12 | fix python import | 5 years ago |
|
|
dfab48d532 |
!3492 Change readme.txt in WarpCTC and checkpoint directory
Merge pull request !3492 from yangyongjie/r0.6 |
5 years ago |
|
|
5a517f3a49 | max_file_size include metadata length and drop last step | 5 years ago |
|
|
62cf01fc7b |
!3509 Add parameter server mode_zoo case and CI test cases.
Merge pull request !3509 from ZPaC/add-ps-test-cases |
5 years ago |
|
|
b109e6f643 | Add parameter server model_zoo case and CI test cases. | 5 years ago |
|
|
fdf198eee9 |
!3493 Modify comment of register_backward_hook [r0.6]
Merge pull request !3493 from Simson/push-to-r06 |
5 years ago |
|
|
7f6f140d94 |
!3498 Fix getting output address of internal output
Merge pull request !3498 from YuJianfeng/r0.6 |
5 years ago |
|
|
ec3e7269ba |
!3505 merge eager mode enable sparse to r0.6
Merge pull request !3505 from riemann_penn/merge_eager_mode_enable_sparse_to_r0.6 |
5 years ago |
|
|
032c5e0fdc | eager mode enable sparse | 5 years ago |
|
|
9626532e0b |
!3499 Delete parameter name hard code for embedding-lookup
Merge pull request !3499 from ZPaC/r0.6-delete-param-name-hard-code |
5 years ago |
|
|
304ae51a25 |
!3470 Init CSV column default list when it's empty r0.6
Merge pull request !3470 from jiangzhiwen/fix_column_name_r_0_6 |
5 years ago |
|
|
2241017e3f | fix word missing in readme.txt | 5 years ago |
|
|
c1b36c3d4f | Delete parameter name hard code for embedding table. | 5 years ago |
|
|
8af4a16d9d | fix sparse feature bug for auto parallel | 5 years ago |
|
|
67ed5451ad | Fix getting output address of internal output | 5 years ago |
|
|
ac564a9e86 |
!3466 fix cpu nonop net fp16 error
Merge pull request !3466 from kisnwang/r0.6-fix-cpu-nonop-fp16-error |
5 years ago |
|
|
375078cf55 |
!3471 Fixing Bug with AutoContrast/Equalize supporting uint8 dtype/mnist
Merge pull request !3471 from guozhijian/fix_autocontrast_equalize_support_uint8 |
5 years ago |
|
|
63bb52b408 | Modify comment of register_backward_hook | 5 years ago |
|
|
c9f25d0d5c |
!3477 upload maskrcnn scripts
Merge pull request !3477 from gengdongjie/r0.6 |
5 years ago |
|
|
b0cb13d265 |
!3463 [MD]Fix Segementation Falut when SentencepieceTokenizer Op before zipOp and ConcatOp
Merge pull request !3463 from xulei/tmp_0.6 |
5 years ago |
|
|
14ce0afab3 |
!3478 Add Warpctc GPU network
Merge pull request !3478 from yangyongjie/r0.6 |
5 years ago |
|
|
26733198e9 |
!3458 fix getdataset size error r0.6
Merge pull request !3458 from panfengfeng/fix_getdataset_size_error_r0.6 |
5 years ago |
|
|
73f58dc937 |
!3480 Graceful shutdown for ps modules
Merge pull request !3480 from chengang/graceful_shutdown_ps_r0.6 |
5 years ago |
|
|
c1332c03e5 | support graceful shutdown for ps components | 5 years ago |
|
|
10c74de9b6 | upload maskrcnn scripts | 5 years ago |
|
|
28b9074e9b | add warpctc GPU | 5 years ago |
|
|
63442d563f |
!3402 [AutoParallel]Fix autoparallel gpu bug
Merge pull request !3402 from lichen/fix_autoparallel_gpu_bug |
5 years ago |
|
|
b0e83c5a06 | Fixing AutoContrast/Equalize Bug | 5 years ago |
|
|
9f5315fc80 | fix cpu nonop net fp16 error | 5 years ago |
|
|
4e7cb1a7a4 | fix get daataset size error | 5 years ago |
|
|
d408cdf0e0 | init column_default_list_ when it is empty | 5 years ago |
|
|
c5e6cfebe7 |
!3436 fix mix precision operator issue
Merge pull request !3436 from wangqiuliang/fix-mix-precision-r0.6 |
5 years ago |
|
|
aac2275d1b | support graceful shutdown for ps components | 5 years ago |
|
|
70aee2fe7a |
!3401 cpp client example
Merge pull request !3401 from hexia/cpp_client_example_r0.6 |
5 years ago |
|
|
c43bc92d7c | add code | 5 years ago |
|
|
5916da1763 | fix mix precision operator issue | 5 years ago |
|
|
50e20e4042 |
!3443 Restore the code to collect the graph network
Merge pull request !3443 from LiHongzhang/oh_graph_r |
5 years ago |
|
|
2373e94384 | restore the ability to collect network graph | 5 years ago |
|
|
cda920b21b |
!3432 add single quotes and modify parameters
Merge pull request !3432 from lijiaqi/add_single_quotes_and_others |
5 years ago |
|
|
af4b4fb36d |
!3417 fix bug of group lr when save ckpt
Merge pull request !3417 from wangnan39/fix_bug_of_group_lr_when_save_ckpt |
5 years ago |
|
|
927a52fdf8 |
!3388 Transfer tuple getitem's control to new added memcpy_async
Merge pull request !3388 from huanghui/r0.6 |
5 years ago |
|
|
0f8c4d6794 |
!3428 modify annotation: wegith_decay modify weight_decay
Merge pull request !3428 from lilei/modify_weight_decay_annotation |
5 years ago |
|
|
8feb9450f2 | add single quotes and modify parameters | 5 years ago |
|
|
f304fe9614 | modify weight_decay annotation | 5 years ago |
|
|
e62137f7c0 |
!3406 fix optimizer parallel problems
Merge pull request !3406 from gziyan/fix_optimizer_parallel_r0.6 |
5 years ago |
|
|
c005dfd803 |
!3389 merge sparse hot fix to r0.6
Merge pull request !3389 from riemann_penn/merger_sparse_hot_fix_to_r0.6 |
5 years ago |
|
|
a051d7c5dc |
!3410 [bug][ci]fix bug when remove the phis
Merge pull request !3410 from vlne-v1/fix_redundant_phi-r0.6 |
5 years ago |
|
|
3c93ff3385 | fix_bug_of_group_lr_when_save_ckpt | 5 years ago |
|
|
43d4f80428 | fix bug in remove phiphi should replace the inner ones first | 5 years ago |
|
|
9f264b6e55 | fix optimizer parallel problems | 5 years ago |
|
|
2cebc62bbf | fix sparse related issues | 5 years ago |
|
|
f9aec99c01 |
!3379 modify the vgg16/lstm path to offical/{cv/nlp}
Merge pull request !3379 from caojian05/ms_master_dev |
5 years ago |
|
|
80a655099a | modify the vgg16/lstm path to offical/{cv/nlp} | 5 years ago |
|
|
f14974392c | cpp_client_example_r0.6 | 5 years ago |
|
|
3901c0414f | deal tuple getitem control for new added memcpy | 5 years ago |
|
|
12738ceda7 | fix auto parallel gpu bug | 5 years ago |
|
|
fe0348b3d7 |
!3380 Fix visit depend node
Merge pull request !3380 from lianliguang/r0.6 |
5 years ago |
|
|
93ce266ae5 |
!3373 support call super when class define in test_case
Merge pull request !3373 from zhangbuxue/support_call_super_when_class_define_in_test_case_ |
5 years ago |
|
|
35b466f8f7 | fix visit depend node | 5 years ago |
|
|
15487759ff | support call super when class define in test_case. | 5 years ago |
|
|
251fba00f5 |
!3363 fix cloner when funcgraph return is null
Merge pull request !3363 from leopz/fix_clone |
5 years ago |
|
|
984be47299 |
!3365 restructure client example
Merge pull request !3365 from hexia/restructure_client_example_r0.6 |
5 years ago |
|
|
45d8a9eea3 |
!3354 improve performance of bert by adding order paramters
Merge pull request !3354 from shibeiji/r0.6 |
5 years ago |
|
|
5cdfbf0e82 |
!3359 fix cpu nonop net
Merge pull request !3359 from kisnwang/r0.6-cpu-support-nonop-net |
5 years ago |
|
|
9daa8a890b | restructure client example | 5 years ago |
|
|
61bf0c5d99 | fix cloner when funcgraph is null | 5 years ago |
|
|
27982ebbe8 |
!3347 Fix internal multiple outputs check
Merge pull request !3347 from YuJianfeng/r0.6 |
5 years ago |
|
|
926120ef95 | cpu support nonop net | 5 years ago |
|
|
1ae2d2d6c8 | add order params for bert to improve performance | 5 years ago |
|
|
16035dc62c | Fix internal multiple outputs check | 5 years ago |
| @@ -106,6 +106,7 @@ endif() # NOT ENABLE_ACL | |||||
| if (ENABLE_SERVING) | if (ENABLE_SERVING) | ||||
| add_subdirectory(serving) | add_subdirectory(serving) | ||||
| add_subdirectory(serving/example/cpp_client) | |||||
| endif() | endif() | ||||
| if (NOT ENABLE_ACL) | if (NOT ENABLE_ACL) | ||||
| @@ -75,7 +75,7 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex | |||||
| 1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package. | 1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package. | ||||
| ``` | ``` | ||||
| pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl | |||||
| pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.6.0-cp37-cp37m-linux_x86_64.whl | |||||
| ``` | ``` | ||||
| 2. Run the following command to verify the install. | 2. Run the following command to verify the install. | ||||
| @@ -1,3 +1,76 @@ | |||||
| # Release 0.6.0-beta | |||||
| ## Major Features and Improvements | |||||
| ### Ascend 910 Training and Inference Framework | |||||
| * New models | |||||
| * There are official, research and community under modelzoo. | |||||
| * Official is maintained with the newest APIs by MindSpore team, MaskRCNN are added. | |||||
| * Research is uploaded by researchers for official review, and APIs may not be updated in time. | |||||
| * Community reprints the relevant links of partner research results. | |||||
| * Hub added on the same level as modelzoo, synchronous storage of materials needed for official hub web pages which will be launched soon. | |||||
| * Support pre-trained models, few lines of code can be used to download and load pre-trained models, supporting inference or transfer learning. | |||||
| * Frontend and user interface | |||||
| * Supports user side operator compilation and graph execution error rendering. | |||||
| * Uniform definition dynamic learning rate behavior in optimizers. | |||||
| * Support IndexSlice in sparse expression. | |||||
| * Support use parent construct method during construct. | |||||
| * Support asynchronous execution save checkpoint file. | |||||
| * Support implicit type conversion in pynative mode. | |||||
| * User interfaces change log | |||||
| * unform learning rate behavior in optimizers([!2755](https://gitee.com/mindspore/mindspore/pulls/2755)) | |||||
| * rename operator of sparse optimizer([!3217](https://gitee.com/mindspore/mindspore/pulls/3217)) | |||||
| * move profiler module from mindinsight to mindspore([!3075](https://gitee.com/mindspore/mindspore/pulls/3075)) | |||||
| * VOCDataset output change to multi-columns([!3093](https://gitee.com/mindspore/mindspore/pulls/3093)) | |||||
| * GetDatasize feature([!3212](https://gitee.com/mindspore/mindspore/pulls/3212)) | |||||
| * dataset: modify config api([!2936](https://gitee.com/mindspore/mindspore/pulls/2936)) | |||||
| * Executor and performance optimization | |||||
| * Decouple C++ and python, so make the architecture more extensible. | |||||
| * Parameter Server for distributed deep learning supported. | |||||
| * Serving:a flexible service deployment framework for deep learning models. | |||||
| * Memory reuse is enhanced, and the batch size of Bert large model is increased from 96 to 160 on a single server. | |||||
| * Data processing, augmentation, and save format | |||||
| * Support MindRecord save operator after date processing | |||||
| * Support automatic fusion operator, such as decode/resize/crop | |||||
| * Support CSV dataset loading | |||||
| ### Other Hardware Support | |||||
| * GPU platform | |||||
| * New model supported: ResNext50, WarpCTC and GoogLeNet. | |||||
| * Support hyperparametric search and data enhanced automl on GPU. | |||||
| * Support Resnet50 automatic parallel in GPU backend. | |||||
| ## Bugfixes | |||||
| * Models | |||||
| * Improved the performance and accuracy on ResNet50([!3456](https://gitee.com/mindspore/mindspore/pulls/3456)) | |||||
| * Fixed the performance test case of bert([!3486](https://gitee.com/mindspore/mindspore/pulls/3486)) | |||||
| * Python API | |||||
| * Fix assign used in while loop([!2720](https://gitee.com/mindspore/mindspore/pulls/2720)) | |||||
| * Revert optimize the graph output of all nop node.([!2857](https://gitee.com/mindspore/mindspore/pulls/2857)) | |||||
| * Print tensor as numpy.([!2859](https://gitee.com/mindspore/mindspore/pulls/2859)) | |||||
| * Support weight decay for sparse optimizer([!2668](https://gitee.com/mindspore/mindspore/pulls/2668)) | |||||
| * Fix BatchToSpaceND([!2741](https://gitee.com/mindspore/mindspore/pulls/2741)) | |||||
| * Fixing type check mistakes of InplaceAdd and Inplace Sub ops([!2744](https://gitee.com/mindspore/mindspore/pulls/2744])) | |||||
| * Change order param only equal to group param([!2748](https://gitee.com/mindspore/mindspore/pulls/2748)) | |||||
| * Executor | |||||
| * The performance of graph whith control flow is optimized([!2931](https://gitee.com/mindspore/mindspore/pulls/2931)) | |||||
| * Fix bug of wrong number of tuple layers([!3390](https://gitee.com/mindspore/mindspore/pulls/3390)) | |||||
| * Fix cpu multi graph memory exception([!3631](https://gitee.com/mindspore/mindspore/pulls/3631)) | |||||
| * Enable data sync when calling operator without defining a cell([!3081](https://gitee.com/mindspore/mindspore/pulls/3081)) | |||||
| * Fix argmaxwith value error in pynative mode on GPU([!3082](https://gitee.com/mindspore/mindspore/pulls/3082)) | |||||
| * Fix precision error with fp16 input on pynative mode([!3196](https://gitee.com/mindspore/mindspore/pulls/3196)) | |||||
| * Data processing | |||||
| * Fix bug of RandomColor and RandomSharpness default parameter checking ([!2833](https://gitee.com/mindspore/mindspore/pulls/2833)) | |||||
| * Fix process hung when training and eval ([!3469](https://gitee.com/mindspore/mindspore/pulls/3469)) | |||||
| * Third party | |||||
| * Sqlite : Update sqlite to 3.32.2 to handle [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358), [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), [CVE-2020-13632](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13632), [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), and [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655). | |||||
| * Libjpeg-turbo : Update libjpeg-turbo to 2.0.4 to handle [CVE-2020-13790](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13790). | |||||
| ## Contributors | |||||
| Thanks goes to these wonderful people: | |||||
| Alexey Shevlyakov, avakh, baihuawei, BowenK, buxue, caifubi, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenjianping, chentingting, chenzomi, chujinjin, Danish Farid, dayschan, dengwentao, dinghao, etone-chan, fangzehua, fary86, geekun, Giancarlo Colmenares, gong chen, gukecai, guohongzilong, hangangqiang, heleiwang, hesham, He Wei, hexia, hongxing, huangdongrun, huanghui, islam_amin, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jojobugfree, Jonathan Yan, jonyguo, Junhan Hu, Kang, kingfo, kouzhenzhong, kpy, kswang, laiyongqiang, leopz, liangzelang, lichenever, lihongkang, Li Hongzhang, lilei, limingqi107, lirongzhen1, liubuyu, liuchongming74, liuwenhao4, liuxiao, Lixia Chen, liyanliu, liyong, lizhenyu, lvliang, Mahdi, Margaret_wangrui, meixiaowei, ms_yan, nhussain, ougongchang, panfengfeng, panyifeng, peilinwang, Peilin Wang, pkuliuliu, qianlong, rick_sanchez, shibeiji, Shida He, shijianning, simson, sunsuodong, suteng, Tinazhang, Tron Zhang, unknown, VectorSL, wandongdong, wangcong, wangdongxu, wangdongxu6, wanghua, wangnan39, Wei Luning, wenchunjiang, wenkai, wilfChen, WilliamLian, wukesong, Xian Weizhao, Xiaoda Zhang, xiefangqi, xulei2020, xunxue, xutianchun, Yang, yanghaitao, yanghaitao1, yanghaoran, yangjie, yangjie159, YangLuo, Yanjun Peng, yankai, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yuchaojie, yujianfeng, zhangzhongpeng, zhangdengcheng, Zhang Qinghua, zhangyinxia, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang, wangdongxu | |||||
| Contributions of any kind are welcome! | |||||
| # Release 0.5.0-beta | # Release 0.5.0-beta | ||||
| ## Major Features and Improvements | ## Major Features and Improvements | ||||
| @@ -62,6 +135,8 @@ | |||||
| * Fix bug of Cifar dataset reading([!2096](https://gitee.com/mindspore/mindspore/pulls/2096)) | * Fix bug of Cifar dataset reading([!2096](https://gitee.com/mindspore/mindspore/pulls/2096)) | ||||
| * Fix bug of C++ behavior in RandomCropAndResize([!2026](https://gitee.com/mindspore/mindspore/pulls/2026)) | * Fix bug of C++ behavior in RandomCropAndResize([!2026](https://gitee.com/mindspore/mindspore/pulls/2026)) | ||||
| * Fix the bug of mindrecord shuffle([!2420](https://gitee.com/mindspore/mindspore/pulls/2420)) | * Fix the bug of mindrecord shuffle([!2420](https://gitee.com/mindspore/mindspore/pulls/2420)) | ||||
| * Third party | |||||
| * Sqlite : Update sqlite to 3.32.2 to handle [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358), [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), [CVE-2020-13632](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13632), [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), and [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655). | |||||
| ## Contributors | ## Contributors | ||||
| Thanks goes to these wonderful people: | Thanks goes to these wonderful people: | ||||
| @@ -134,7 +209,7 @@ Contributions of any kind are welcome! | |||||
| * Fix sens shape error of TrainOneStepWithLossScaleCell([!1050](https://gitee.com/mindspore/mindspore/pulls/1050)) | * Fix sens shape error of TrainOneStepWithLossScaleCell([!1050](https://gitee.com/mindspore/mindspore/pulls/1050)) | ||||
| * Fix BatchNormGrad operator([!1344](https://gitee.com/mindspore/mindspore/pulls/1344)) | * Fix BatchNormGrad operator([!1344](https://gitee.com/mindspore/mindspore/pulls/1344)) | ||||
| * Executor | * Executor | ||||
| * Fix dropout,topK and addn errors in PyNative mode ([!1285](https://gitee.com/mindspore/mindspore/pulls/1285), [!1138](https://gitee.com/mindspore/mindspore/pulls/1138), [!1033](https://gitee.com/mindspore/mindspore/pulls/1033)). | |||||
| * Fix dropout, topK and addn errors in PyNative mode ([!1285](https://gitee.com/mindspore/mindspore/pulls/1285), [!1138](https://gitee.com/mindspore/mindspore/pulls/1138), [!1033](https://gitee.com/mindspore/mindspore/pulls/1033)). | |||||
| * Fix memory leaks after execution in PyNatvie mode ([!1201](https://gitee.com/mindspore/mindspore/pulls/1201)). | * Fix memory leaks after execution in PyNatvie mode ([!1201](https://gitee.com/mindspore/mindspore/pulls/1201)). | ||||
| * Fix HCCL failure in some special scenes ([!1204](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1204), [!1252](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1252)). | * Fix HCCL failure in some special scenes ([!1204](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1204), [!1252](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1252)). | ||||
| * Fix SSD network when Select failed, cann't find kernel info([!1449](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1449)). | * Fix SSD network when Select failed, cann't find kernel info([!1449](https://gitee.com/mindspore/dashboard/projects/mindspore/mindspore/pulls/1449)). | ||||
| @@ -1 +1 @@ | |||||
| Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625 | |||||
| Subproject commit 5c0e3d2ffb6ba7650453c3b11163237a43d206d6 | |||||
| @@ -491,9 +491,9 @@ build_predict() | |||||
| cd "${BASEPATH}/predict/output/" | cd "${BASEPATH}/predict/output/" | ||||
| if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then | if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then | ||||
| tar -cf MSPredict-0.5.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed | |||||
| tar -cf MSPredict-0.6.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed | |||||
| elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then | elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then | ||||
| tar -cf MSPredict-0.5.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed | |||||
| tar -cf MSPredict-0.6.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed | |||||
| fi | fi | ||||
| echo "success to build predict project!" | echo "success to build predict project!" | ||||
| } | } | ||||
| @@ -8,7 +8,7 @@ else() | |||||
| VER 67.1 | VER 67.1 | ||||
| LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N} | LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N} | ||||
| URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz | URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz | ||||
| MD5 0c2662a2b0bc80b0eb56495205247c8f | |||||
| MD5 fd525fb47d8827b0b7da78b51dd2d93f | |||||
| CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh | CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh | ||||
| ) | ) | ||||
| include_directories(${icu4c_INC}) | include_directories(${icu4c_INC}) | ||||
| @@ -12,6 +12,7 @@ mindspore_add_pkg(jpeg_turbo | |||||
| URL https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.4.tar.gz | URL https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.4.tar.gz | ||||
| MD5 44c43e4a9fb352f47090804529317c88 | MD5 44c43e4a9fb352f47090804529317c88 | ||||
| CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE | CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE | ||||
| PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/jpeg_turbo/jpeg_turbo.patch001 | |||||
| ) | ) | ||||
| include_directories(${jpeg_turbo_INC}) | include_directories(${jpeg_turbo_INC}) | ||||
| add_library(mindspore::jpeg_turbo ALIAS jpeg_turbo::jpeg) | add_library(mindspore::jpeg_turbo ALIAS jpeg_turbo::jpeg) | ||||
| @@ -278,6 +278,13 @@ if (ENABLE_SERVING) | |||||
| COMPONENT mindspore | COMPONENT mindspore | ||||
| ) | ) | ||||
| file(GLOB MS_SERVING_PY_LIST ${CMAKE_SOURCE_DIR}/serving/*.py) | |||||
| install( | |||||
| FILES ${MS_SERVING_PY_LIST} | |||||
| DESTINATION ${INSTALL_PY_DIR} | |||||
| COMPONENT mindspore | |||||
| ) | |||||
| install( | install( | ||||
| TARGETS inference | TARGETS inference | ||||
| DESTINATION ${INSTALL_LIB_DIR} | DESTINATION ${INSTALL_LIB_DIR} | ||||
| @@ -0,0 +1,67 @@ | |||||
| FROM ubuntu:18.04 | |||||
| MAINTAINER leonwanghui <leon.wanghui@huawei.com> | |||||
| # Set env | |||||
| ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5 | |||||
| ENV PATH /usr/local/bin:$PATH | |||||
| # Install base tools | |||||
| RUN apt update \ | |||||
| && DEBIAN_FRONTEND=noninteractive apt install -y \ | |||||
| vim \ | |||||
| wget \ | |||||
| curl \ | |||||
| xz-utils \ | |||||
| net-tools \ | |||||
| openssh-client \ | |||||
| git \ | |||||
| ntpdate \ | |||||
| tzdata \ | |||||
| tcl \ | |||||
| sudo \ | |||||
| bash-completion | |||||
| # Install compile tools | |||||
| RUN DEBIAN_FRONTEND=noninteractive apt install -y \ | |||||
| gcc \ | |||||
| g++ \ | |||||
| zlibc \ | |||||
| make \ | |||||
| libgmp-dev \ | |||||
| patch \ | |||||
| autoconf \ | |||||
| libtool \ | |||||
| automake \ | |||||
| flex | |||||
| # Set bash | |||||
| RUN echo "dash dash/sh boolean false" | debconf-set-selections | |||||
| RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash | |||||
| # Install python (v3.7.5) | |||||
| RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||||
| libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \ | |||||
| && cd /tmp \ | |||||
| && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \ | |||||
| && tar -xvf v3.7.5.tar.gz \ | |||||
| && cd /tmp/cpython-3.7.5 \ | |||||
| && mkdir -p ${PYTHON_ROOT_PATH} \ | |||||
| && ./configure --prefix=${PYTHON_ROOT_PATH} \ | |||||
| && make -j4 \ | |||||
| && make install -j4 \ | |||||
| && rm -f /usr/local/bin/python \ | |||||
| && rm -f /usr/local/bin/pip \ | |||||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||||
| && rm -rf /tmp/cpython-3.7.5 \ | |||||
| && rm -f /tmp/v3.7.5.tar.gz | |||||
| # Set pip source | |||||
| RUN mkdir -pv /root/.pip \ | |||||
| && echo "[global]" > /root/.pip/pip.conf \ | |||||
| && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \ | |||||
| && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf | |||||
| # Install MindSpore cpu whl package | |||||
| RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.6.0-cp37-cp37m-linux_x86_64.whl | |||||
| @@ -0,0 +1,83 @@ | |||||
| FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 | |||||
| MAINTAINER leonwanghui <leon.wanghui@huawei.com> | |||||
| # Set env | |||||
| ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5 | |||||
| ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5 | |||||
| ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH | |||||
| ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH | |||||
| # Install base tools | |||||
| RUN apt update \ | |||||
| && DEBIAN_FRONTEND=noninteractive apt install -y \ | |||||
| vim \ | |||||
| wget \ | |||||
| curl \ | |||||
| xz-utils \ | |||||
| net-tools \ | |||||
| openssh-client \ | |||||
| git \ | |||||
| ntpdate \ | |||||
| tzdata \ | |||||
| tcl \ | |||||
| sudo \ | |||||
| bash-completion | |||||
| # Install compile tools | |||||
| RUN DEBIAN_FRONTEND=noninteractive apt install -y \ | |||||
| gcc \ | |||||
| g++ \ | |||||
| zlibc \ | |||||
| make \ | |||||
| libgmp-dev \ | |||||
| patch \ | |||||
| autoconf \ | |||||
| libtool \ | |||||
| automake \ | |||||
| flex \ | |||||
| libnccl2=2.4.8-1+cuda10.1 \ | |||||
| libnccl-dev=2.4.8-1+cuda10.1 | |||||
| # Set bash | |||||
| RUN echo "dash dash/sh boolean false" | debconf-set-selections | |||||
| RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash | |||||
| # Install python (v3.7.5) | |||||
| RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||||
| libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \ | |||||
| && cd /tmp \ | |||||
| && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \ | |||||
| && tar -xvf v3.7.5.tar.gz \ | |||||
| && cd /tmp/cpython-3.7.5 \ | |||||
| && mkdir -p ${PYTHON_ROOT_PATH} \ | |||||
| && ./configure --prefix=${PYTHON_ROOT_PATH} \ | |||||
| && make -j4 \ | |||||
| && make install -j4 \ | |||||
| && rm -f /usr/local/bin/python \ | |||||
| && rm -f /usr/local/bin/pip \ | |||||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||||
| && rm -rf /tmp/cpython-3.7.5 \ | |||||
| && rm -f /tmp/v3.7.5.tar.gz | |||||
| # Set pip source | |||||
| RUN mkdir -pv /root/.pip \ | |||||
| && echo "[global]" > /root/.pip/pip.conf \ | |||||
| && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \ | |||||
| && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf | |||||
| # Install openmpi (v3.1.5) | |||||
| RUN cd /tmp \ | |||||
| && wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \ | |||||
| && tar -xvf openmpi-3.1.5.tar.gz \ | |||||
| && cd /tmp/openmpi-3.1.5 \ | |||||
| && mkdir -p ${OMPI_ROOT_PATH} \ | |||||
| && ./configure --prefix=${OMPI_ROOT_PATH} \ | |||||
| && make -j4 \ | |||||
| && make install -j4 \ | |||||
| && rm -rf /tmp/openmpi-3.1.5 \ | |||||
| && rm -f /tmp/openmpi-3.1.5.tar.gz | |||||
| # Install MindSpore cuda-10.1 whl package | |||||
| RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.6.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.6.0-cp37-cp37m-linux_x86_64.whl | |||||
| @@ -1 +1 @@ | |||||
| Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009 | |||||
| Subproject commit 885af56694eff438a4ea079c0c34de30993f1473 | |||||
| @@ -14,7 +14,10 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """builtin_operations""" | """builtin_operations""" | ||||
| import numpy as np | import numpy as np | ||||
| from mindspore.ops import functional as F | |||||
| from mindspore.ops import composite as C | |||||
| from mindspore.common.tensor import Tensor | from mindspore.common.tensor import Tensor | ||||
| import mindspore.common.dtype as mstype | |||||
| from mindspore.common.dtype import dtype_to_nptype, get_py_obj_dtype | from mindspore.common.dtype import dtype_to_nptype, get_py_obj_dtype | ||||
| @@ -113,6 +116,7 @@ def bool_or(x, y): | |||||
| """Implement `bool_or`.""" | """Implement `bool_or`.""" | ||||
| return x or y | return x or y | ||||
| def vm_compare(*args): | def vm_compare(*args): | ||||
| """Implement `vm_compare` for tensor.""" | """Implement `vm_compare` for tensor.""" | ||||
| obj_str = args[-1] | obj_str = args[-1] | ||||
| @@ -141,10 +145,12 @@ def list_len(x): | |||||
| """Implement `list_len`.""" | """Implement `list_len`.""" | ||||
| return len(x) | return len(x) | ||||
| def Depend(value, expr): | def Depend(value, expr): | ||||
| """Implement `Depend`.""" | """Implement `Depend`.""" | ||||
| return value | return value | ||||
| # only used in PyNative mode | # only used in PyNative mode | ||||
| def make_ref(key, value, ref): | def make_ref(key, value, ref): | ||||
| return value | return value | ||||
| @@ -171,3 +177,16 @@ def tuple_to_array(x): | |||||
| def stop_gradient(x): | def stop_gradient(x): | ||||
| """Implement `stop_gradient`.""" | """Implement `stop_gradient`.""" | ||||
| return x | return x | ||||
| hyper_map = C.HyperMap() | |||||
| def mixed_precision_cast(dst_type, x): | |||||
| """Implement `mixed_precision_cast`.""" | |||||
| def cast_inner(data): | |||||
| if isinstance(data, Tensor) and data.dtype in (mstype.float32, mstype.float16): | |||||
| return F.cast(data, dst_type) | |||||
| return data | |||||
| return hyper_map(cast_inner, x) | |||||
| @@ -459,27 +459,27 @@ class Parser: | |||||
| logger.debug("ops info = %r", ops_info) | logger.debug("ops info = %r", ops_info) | ||||
| return ops_info | return ops_info | ||||
| def analyze_super(self, father_class_node, subclass_instance): | |||||
| def analyze_super(self, class_type_node, subclass_instance): | |||||
| """Analyze super and return a class instance.""" | """Analyze super and return a class instance.""" | ||||
| father_class = None | |||||
| if father_class_node is None: | |||||
| father_class = type(subclass_instance) | |||||
| if isinstance(father_class_node, ast.Name): | |||||
| father_class_name = getattr(father_class_node, 'id') | |||||
| father_class = self.global_namespace[father_class_name] | |||||
| if isinstance(father_class_node, ast.Attribute): | |||||
| value = getattr(father_class_node, 'value') | |||||
| attr = getattr(father_class_node, 'attr') | |||||
| module_name = getattr(value, 'id') | |||||
| father_class_module = self.global_namespace[module_name] | |||||
| father_class = getattr(father_class_module, attr) | |||||
| if father_class is None: | |||||
| raise ValueError("When call 'super', the father class is None.") | |||||
| if not isinstance(subclass_instance, father_class): | |||||
| sub_class = type(subclass_instance) | |||||
| if class_type_node is None: | |||||
| return super(sub_class, subclass_instance) | |||||
| if isinstance(class_type_node, ast.Name): | |||||
| class_name = getattr(class_type_node, 'id') | |||||
| elif isinstance(class_type_node, ast.Attribute): | |||||
| class_name = getattr(class_type_node, 'attr') | |||||
| else: | |||||
| raise ValueError(f"When call 'super', the first arg should be a class type, " | |||||
| f"but got {class_type_node.__class__.__name__}.") | |||||
| target_father_class = None | |||||
| for class_element in sub_class.mro(): | |||||
| if class_element.__name__ == class_name: | |||||
| target_father_class = class_element | |||||
| break | |||||
| if target_father_class is None: | |||||
| raise ValueError("When call 'super', the second arg should be an instance of first arg.") | raise ValueError("When call 'super', the second arg should be an instance of first arg.") | ||||
| target_class_instance = super(father_class, subclass_instance) | |||||
| return target_class_instance | |||||
| return super(target_father_class, subclass_instance) | |||||
| def get_location(self, node): | def get_location(self, node): | ||||
| """ | """ | ||||
| @@ -132,7 +132,9 @@ def while_cond(x): | |||||
| @constexpr | @constexpr | ||||
| def check_type_same(x_type, base_type): | def check_type_same(x_type, base_type): | ||||
| """Check x_type is same as base_type.""" | """Check x_type is same as base_type.""" | ||||
| return mstype.issubclass_(x_type, base_type) | |||||
| if mstype.issubclass_(x_type, base_type): | |||||
| return True | |||||
| raise TypeError(f"The arg 'x' should be a {base_type}, but got {x_type}.") | |||||
| @constexpr | @constexpr | ||||
| @@ -31,8 +31,9 @@ class PServerKernel { | |||||
| ~PServerKernel() = default; | ~PServerKernel() = default; | ||||
| PServerKernel(const PServerKernel &) = delete; | PServerKernel(const PServerKernel &) = delete; | ||||
| PServerKernel &operator=(const PServerKernel &) = delete; | PServerKernel &operator=(const PServerKernel &) = delete; | ||||
| virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | ||||
| virtual void InitKernel(const CNodePtr &cnode, | |||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||||
| virtual void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | virtual void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | ||||
| virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| const std::vector<AddressPtr> &outputs) = 0; | const std::vector<AddressPtr> &outputs) = 0; | ||||
| @@ -33,8 +33,9 @@ class PullKernel : public CPUKernel { | |||||
| ~PullKernel() override = default; | ~PullKernel() override = default; | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &) { | bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &) { | ||||
| // If the paramter is embedding table, don't Pull from PServer. | |||||
| if (param_name_.find("embedding") == std::string::npos && param_name_.find("wide_w") == std::string::npos) { | |||||
| bool init_in_server = mindspore::parallel::ps::Worker<float>::GetInstance().GetParamInitInServer(param_name_); | |||||
| // If init_in_server, forward kernel should run in server too. | |||||
| if (!init_in_server) { | |||||
| parallel::ps::Worker<T>::GetInstance().Pull(key_, inputs[1]->addr, inputs[1]->size); | parallel::ps::Worker<T>::GetInstance().Pull(key_, inputs[1]->addr, inputs[1]->size); | ||||
| } | } | ||||
| return true; | return true; | ||||
| @@ -43,7 +43,10 @@ class PushKernel : public CPUKernel { | |||||
| sizes.push_back(SizeToInt(input->size) / sizeof(T)); | sizes.push_back(SizeToInt(input->size) / sizeof(T)); | ||||
| } | } | ||||
| parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes); | parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes); | ||||
| memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t)); | |||||
| auto ret = memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t)); | |||||
| if (ret != EOK) { | |||||
| MS_LOG(EXCEPTION) << "Lookup id memcpy failed."; | |||||
| } | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -23,7 +23,7 @@ namespace mindspore { | |||||
| namespace kernel { | namespace kernel { | ||||
| namespace ps { | namespace ps { | ||||
| void SparseApplyAdamPSKernel::InitKernel( | void SparseApplyAdamPSKernel::InitKernel( | ||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | ||||
| std::vector<size_t> &var_shape = *(shape_vec[0]); | std::vector<size_t> &var_shape = *(shape_vec[0]); | ||||
| std::vector<size_t> &m_shape = *(shape_vec[1]); | std::vector<size_t> &m_shape = *(shape_vec[1]); | ||||
| @@ -55,11 +55,9 @@ void SparseApplyAdamPSKernel::InitKernel( | |||||
| if (grad_shape[0] != indices_size_) { | if (grad_shape[0] != indices_size_) { | ||||
| MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | ||||
| } | } | ||||
| /* | |||||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) { | |||||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov"); | |||||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) { | |||||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov"); | |||||
| } | } | ||||
| */ | |||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| @@ -30,7 +30,8 @@ class SparseApplyAdamPSKernel : public SparseApplyAdamCPUKernel, public PServerK | |||||
| SparseApplyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | SparseApplyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | ||||
| ~SparseApplyAdamPSKernel() override = default; | ~SparseApplyAdamPSKernel() override = default; | ||||
| void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void InitKernel(const CNodePtr &cnode, | |||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | ||||
| bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| const std::vector<AddressPtr> &outputs) override; | const std::vector<AddressPtr> &outputs) override; | ||||
| @@ -20,7 +20,7 @@ namespace mindspore { | |||||
| namespace kernel { | namespace kernel { | ||||
| namespace ps { | namespace ps { | ||||
| void SparseApplyFtrlPSKernel::InitKernel( | void SparseApplyFtrlPSKernel::InitKernel( | ||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | ||||
| std::vector<size_t> var_shape = *(shape_vec[0]); | std::vector<size_t> var_shape = *(shape_vec[0]); | ||||
| std::vector<size_t> accum_shape = *(shape_vec[1]); | std::vector<size_t> accum_shape = *(shape_vec[1]); | ||||
| @@ -46,10 +46,22 @@ void SparseApplyFtrlPSKernel::InitKernel( | |||||
| if (grad_shape[0] != indices_size_) { | if (grad_shape[0] != indices_size_) { | ||||
| MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; | MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; | ||||
| } | } | ||||
| lr_ = 0.01; | |||||
| l1_ = 1e-8; | |||||
| l2_ = 1e-8; | |||||
| lr_power_ = -0.5; | |||||
| lr_ = AnfAlgo::GetNodeAttr<float>(cnode, "lr"); | |||||
| if (lr_ <= 0) { | |||||
| MS_LOG(EXCEPTION) << "lr should be a positive scalar"; | |||||
| } | |||||
| l1_ = AnfAlgo::GetNodeAttr<float>(cnode, "l1"); | |||||
| if (l1_ < 0) { | |||||
| MS_LOG(EXCEPTION) << "l1 should be a non-negative scalar"; | |||||
| } | |||||
| l2_ = AnfAlgo::GetNodeAttr<float>(cnode, "l2"); | |||||
| if (l2_ < 0) { | |||||
| MS_LOG(EXCEPTION) << "l2 should be a non-negative scalar"; | |||||
| } | |||||
| lr_power_ = AnfAlgo::GetNodeAttr<float>(cnode, "lr_power"); | |||||
| if (lr_power_ > 0) { | |||||
| MS_LOG(EXCEPTION) << "lr_power should be a non-positive scalar"; | |||||
| } | |||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| @@ -30,7 +30,8 @@ class SparseApplyFtrlPSKernel : public SparseApplyFtrlCPUKernel, public PServerK | |||||
| SparseApplyFtrlPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | SparseApplyFtrlPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | ||||
| ~SparseApplyFtrlPSKernel() override = default; | ~SparseApplyFtrlPSKernel() override = default; | ||||
| void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void InitKernel(const CNodePtr &cnode, | |||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | ||||
| bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| @@ -23,7 +23,7 @@ namespace mindspore { | |||||
| namespace kernel { | namespace kernel { | ||||
| namespace ps { | namespace ps { | ||||
| void SparseApplyLazyAdamPSKernel::InitKernel( | void SparseApplyLazyAdamPSKernel::InitKernel( | ||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | ||||
| std::vector<size_t> &var_shape = *(shape_vec[0]); | std::vector<size_t> &var_shape = *(shape_vec[0]); | ||||
| std::vector<size_t> &m_shape = *(shape_vec[1]); | std::vector<size_t> &m_shape = *(shape_vec[1]); | ||||
| @@ -55,11 +55,9 @@ void SparseApplyLazyAdamPSKernel::InitKernel( | |||||
| if (grad_shape[0] != indices_size_) { | if (grad_shape[0] != indices_size_) { | ||||
| MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | ||||
| } | } | ||||
| /* | |||||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) { | |||||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov"); | |||||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) { | |||||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov"); | |||||
| } | } | ||||
| */ | |||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | ||||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | ||||
| @@ -30,7 +30,8 @@ class SparseApplyLazyAdamPSKernel : public SparseApplyLazyAdamCPUKernel, public | |||||
| SparseApplyLazyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | SparseApplyLazyAdamPSKernel(size_t rank_id, size_t pserver_num) : PServerKernel(rank_id, pserver_num) {} | ||||
| ~SparseApplyLazyAdamPSKernel() override = default; | ~SparseApplyLazyAdamPSKernel() override = default; | ||||
| void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void InitKernel(const CNodePtr &cnode, | |||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | |||||
| void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | void ReInit(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) override; | ||||
| bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| const std::vector<AddressPtr> &outputs) override; | const std::vector<AddressPtr> &outputs) override; | ||||
| @@ -0,0 +1,226 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <algorithm> | |||||
| #include "maxpool_with_argmax_grad_impl.cuh" | |||||
| #include "runtime/device/gpu/cuda_common.h" | |||||
| #include "include/cuda_fp16.h" | |||||
| template <typename T, typename S> | |||||
| __global__ void MaxPoolWithArgmaxGrad(const T* x, | |||||
| const T* dy, | |||||
| const S* index, | |||||
| const int n, | |||||
| const int c, | |||||
| const int xHeight, | |||||
| const int xWidth, | |||||
| const int dyHeight, | |||||
| const int dyWidth, | |||||
| const int windowHeight, | |||||
| const int windowWidth, | |||||
| const int strideHeight, | |||||
| const int strideWidth, | |||||
| const int padTop, | |||||
| const int padLeft, | |||||
| const int xNCHW, | |||||
| const int xCHW, | |||||
| const int xHW, | |||||
| const int dyCHW, | |||||
| const int dyHW, | |||||
| T* dx) { | |||||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| pos < (xNCHW); | |||||
| pos += blockDim.x * gridDim.x) { | |||||
| const int posn = pos / xCHW; | |||||
| const int posc = pos / xHW % c; | |||||
| const int posh = pos / xHeight % xHeight; | |||||
| const int posw = pos % xWidth; | |||||
| const S posIdx = posh*xWidth + posw; | |||||
| int hstart = posh+padTop; | |||||
| if (hstart < windowHeight) { | |||||
| hstart = 0; | |||||
| } else { | |||||
| hstart = (hstart-windowHeight)/strideHeight + 1; | |||||
| } | |||||
| int wstart = posw+padLeft; | |||||
| if (wstart < windowWidth) { | |||||
| wstart = 0; | |||||
| } else { | |||||
| wstart = (wstart-windowWidth)/strideWidth + 1; | |||||
| } | |||||
| const int hend = min((posh+padTop)/strideHeight +1, dyHeight); | |||||
| const int wend = min((posw+padLeft)/strideWidth +1, dyWidth); | |||||
| const int channelStart = posn*dyCHW + posc*dyHW; | |||||
| T dySum = static_cast<T>(0.0); | |||||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||||
| const int curIdx = hcur*dyWidth + wcur; | |||||
| S maxIdx = index[channelStart+curIdx]; | |||||
| if (maxIdx == posIdx) { | |||||
| dySum += dy[channelStart+curIdx]; | |||||
| } | |||||
| } | |||||
| } | |||||
| dx[pos] = dySum; | |||||
| } | |||||
| return; | |||||
| } | |||||
| template <> | |||||
| __global__ void MaxPoolWithArgmaxGrad(const half* x, | |||||
| const half* dy, | |||||
| const int* index, | |||||
| const int n, | |||||
| const int c, | |||||
| const int xHeight, | |||||
| const int xWidth, | |||||
| const int dyHeight, | |||||
| const int dyWidth, | |||||
| const int windowHeight, | |||||
| const int windowWidth, | |||||
| const int strideHeight, | |||||
| const int strideWidth, | |||||
| const int padTop, | |||||
| const int padLeft, | |||||
| const int xNCHW, | |||||
| const int xCHW, | |||||
| const int xHW, | |||||
| const int dyCHW, | |||||
| const int dyHW, | |||||
| half* dx) { | |||||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| pos < (xNCHW); | |||||
| pos += blockDim.x * gridDim.x) { | |||||
| const int posn = pos / xCHW; | |||||
| const int posc = pos / xHW % c; | |||||
| const int posh = pos / xHeight % xHeight; | |||||
| const int posw = pos % xWidth; | |||||
| const int posIdx = posh*xWidth + posw; | |||||
| int hstart = posh+padTop; | |||||
| if (hstart < windowHeight) { | |||||
| hstart = 0; | |||||
| } else { | |||||
| hstart = (hstart-windowHeight)/strideHeight + 1; | |||||
| } | |||||
| int wstart = posw+padLeft; | |||||
| if (wstart < windowWidth) { | |||||
| wstart = 0; | |||||
| } else { | |||||
| wstart = (wstart-windowWidth)/strideWidth + 1; | |||||
| } | |||||
| const int hend = min((posh+padTop)/strideHeight +1, dyHeight); | |||||
| const int wend = min((posw+padLeft)/strideWidth +1, dyWidth); | |||||
| const int channelStart = posn*dyCHW + posc*dyHW; | |||||
| float dySum = 0.0f; | |||||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||||
| const int curIdx = hcur*dyWidth + wcur; | |||||
| int maxIdx = index[channelStart+curIdx]; | |||||
| if (maxIdx == posIdx) { | |||||
| dySum += __half2float(dy[channelStart+curIdx]); | |||||
| } | |||||
| } | |||||
| } | |||||
| dx[pos] = __float2half(dySum); | |||||
| } | |||||
| return; | |||||
| } | |||||
| template <typename T, typename S> | |||||
| void CalMaxPoolWithArgmaxGrad(const T* x, | |||||
| const T* dy, | |||||
| const S* index, | |||||
| const int n, | |||||
| const int c, | |||||
| const int xHeight, | |||||
| const int xWidth, | |||||
| const int dyHeight, | |||||
| const int dyWidth, | |||||
| const int windowHeight, | |||||
| const int windowWidth, | |||||
| const int strideHeight, | |||||
| const int strideWidth, | |||||
| const int padTop, | |||||
| const int padLeft, | |||||
| T* dx, | |||||
| cudaStream_t cuda_stream) { | |||||
| const int xHW = xHeight*xWidth; | |||||
| const int xCHW = c*xHW; | |||||
| const int xNCHW = n*xCHW; | |||||
| const int dyHW = dyHeight*dyWidth; | |||||
| const int dyCHW = c*dyHW; | |||||
| MaxPoolWithArgmaxGrad<<<GET_BLOCKS(xNCHW), | |||||
| GET_THREADS, | |||||
| 0, | |||||
| cuda_stream>>>( | |||||
| x, | |||||
| dy, | |||||
| index, | |||||
| n, | |||||
| c, | |||||
| xHeight, | |||||
| xWidth, | |||||
| dyHeight, | |||||
| dyWidth, | |||||
| windowHeight, | |||||
| windowWidth, | |||||
| strideHeight, | |||||
| strideWidth, | |||||
| padTop, | |||||
| padLeft, | |||||
| xNCHW, | |||||
| xCHW, | |||||
| xHW, | |||||
| dyCHW, | |||||
| dyHW, | |||||
| dx); | |||||
| return; | |||||
| } | |||||
// Explicit instantiations for the type combinations registered with the GPU
// kernel factory: float32 and float16 data, int32 argmax indices.
template void CalMaxPoolWithArgmaxGrad<float, int>(const float* x,
                                                   const float* dy,
                                                   const int* index,
                                                   const int n,
                                                   const int c,
                                                   const int xHeight,
                                                   const int xWidth,
                                                   const int dyHeight,
                                                   const int dyWidth,
                                                   const int windowHeight,
                                                   const int windowWidth,
                                                   const int strideHeight,
                                                   const int strideWidth,
                                                   const int padTop,
                                                   const int padLeft,
                                                   float* dx,
                                                   cudaStream_t cuda_stream);
template void CalMaxPoolWithArgmaxGrad<half, int>(const half* x,
                                                  const half* dy,
                                                  const int* index,
                                                  const int n,
                                                  const int c,
                                                  const int xHeight,
                                                  const int xWidth,
                                                  const int dyHeight,
                                                  const int dyWidth,
                                                  const int windowHeight,
                                                  const int windowWidth,
                                                  const int strideHeight,
                                                  const int strideWidth,
                                                  const int padTop,
                                                  const int padLeft,
                                                  half* dx,
                                                  cudaStream_t cuda_stream);
| @@ -0,0 +1,25 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||||
// Computes dx, the gradient of MaxPoolWithArgmax: for every input position it
// accumulates the dy values of all pooling windows whose recorded argmax
// index points at that position. All pointers are device memory; the launch
// runs asynchronously on cuda_stream.
template <typename T, typename S>
void CalMaxPoolWithArgmaxGrad(const T* x, const T* dy, const S* index, const int n, const int c, const int xHeight,
                              const int xWidth, const int dyHeight, const int dyWidth, const int windowHeight,
                              const int windowWidth, const int strideHeight, const int strideWidth, const int padTop,
                              const int padLeft, T* dx, cudaStream_t cuda_stream);
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||||
| @@ -0,0 +1,149 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <algorithm> | |||||
| #include "maxpool_with_argmax_impl.cuh" | |||||
| #include "runtime/device/gpu/cuda_common.h" | |||||
| #include "include/cuda_fp16.h" | |||||
| template <typename T, typename S> | |||||
| __global__ void MaxPoolWithArgmax(const T* input, | |||||
| const int n, | |||||
| const int c, | |||||
| const int h, | |||||
| const int w, | |||||
| const int windowHeight, | |||||
| const int windowWidth, | |||||
| const int strideHeight, | |||||
| const int strideWidth, | |||||
| const int padTop, | |||||
| const int padLeft, | |||||
| const int outputHeight, | |||||
| const int outputWidth, | |||||
| const int outputNCHW, | |||||
| const int outputCHW, | |||||
| const int outputHW, | |||||
| T* output, | |||||
| S *index) { | |||||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| pos < (outputNCHW); | |||||
| pos += blockDim.x * gridDim.x) { | |||||
| const int posn = pos / outputCHW; | |||||
| const int posc = pos / outputHW % c; | |||||
| const int posh = pos / outputHeight % outputHeight; | |||||
| const int posw = pos % outputWidth; | |||||
| int hstart = posh * strideHeight - padTop; | |||||
| int wstart = posw * strideWidth - padLeft; | |||||
| const int hend = min(hstart + windowHeight, h); | |||||
| const int wend = min(wstart + windowWidth, w); | |||||
| hstart = max(hstart, 0); | |||||
| wstart = max(wstart, 0); | |||||
| S inputStart = posn*c*h*w + posc*h*w; | |||||
| S maxIdx = hstart*w + wstart; | |||||
| T maxData = input[inputStart+maxIdx]; | |||||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||||
| S inputIdx = hcur*w + wcur; | |||||
| T inputData = input[inputStart+inputIdx]; | |||||
| if (inputData > maxData) { | |||||
| maxIdx = inputIdx; | |||||
| maxData = inputData; | |||||
| } | |||||
| } | |||||
| } | |||||
| output[pos] = maxData; | |||||
| index[pos] = maxIdx; | |||||
| } | |||||
| return; | |||||
| } | |||||
| template <typename T, typename S> | |||||
| void CalMaxPoolWithArgmax(const T* input, | |||||
| const int n, | |||||
| const int c, | |||||
| const int h, | |||||
| const int w, | |||||
| const int windowHeight, | |||||
| const int windowWidth, | |||||
| const int strideHeight, | |||||
| const int strideWidth, | |||||
| const int padTop, | |||||
| const int padLeft, | |||||
| const int outputHeight, | |||||
| const int outputWidth, | |||||
| T* output, | |||||
| S *index, | |||||
| cudaStream_t cuda_stream) { | |||||
| const int outputNCHW = n*c*outputHeight*outputWidth; | |||||
| const int outputCHW = c*outputHeight*outputWidth; | |||||
| const int outputHW = outputHeight*outputWidth; | |||||
| MaxPoolWithArgmax<<<GET_BLOCKS(n*c*outputHeight*outputWidth), | |||||
| GET_THREADS, | |||||
| 0, | |||||
| cuda_stream>>>( | |||||
| input, | |||||
| n, | |||||
| c, | |||||
| h, | |||||
| w, | |||||
| windowHeight, | |||||
| windowWidth, | |||||
| strideHeight, | |||||
| strideWidth, | |||||
| padTop, | |||||
| padLeft, | |||||
| outputHeight, | |||||
| outputWidth, | |||||
| outputNCHW, | |||||
| outputCHW, | |||||
| outputHW, | |||||
| output, | |||||
| index); | |||||
| return; | |||||
| } | |||||
// Explicit instantiations for the type combinations registered with the GPU
// kernel factory: float32 and float16 data, int32 argmax indices.
template void CalMaxPoolWithArgmax<float, int>(const float* input,
                                               const int n,
                                               const int c,
                                               const int h,
                                               const int w,
                                               const int windowHeight,
                                               const int windowWidth,
                                               const int strideHeight,
                                               const int strideWidth,
                                               const int padTop,
                                               const int padLeft,
                                               const int outputHeight,
                                               const int outputWidth,
                                               float* output,
                                               int* index,
                                               cudaStream_t cuda_stream);
template void CalMaxPoolWithArgmax<half, int>(const half* input,
                                              const int n,
                                              const int c,
                                              const int h,
                                              const int w,
                                              const int windowHeight,
                                              const int windowWidth,
                                              const int strideHeight,
                                              const int strideWidth,
                                              const int padTop,
                                              const int padLeft,
                                              const int outputHeight,
                                              const int outputWidth,
                                              half* output,
                                              int* index,
                                              cudaStream_t cuda_stream);
| @@ -0,0 +1,25 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||||
// Launches the MaxPoolWithArgmax forward kernel: writes the pooled values to
// `output` and, for each output element, the flat (h*w) index of the window
// maximum to `index`. All pointers are device memory; the launch runs
// asynchronously on cuda_stream.
template <typename T, typename S>
void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, const int windowHeight,
                          const int windowWidth, const int strideHeight, const int strideWidth, const int padTop,
                          const int padLeft, const int outputHeight, const int outputWidth, T* output, S *index,
                          cudaStream_t cuda_stream);
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||||
| @@ -0,0 +1,30 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.h" | |||||
namespace mindspore {
namespace kernel {
// Register the forward kernel for float32 and float16 inputs.
// The pooled output matches the input dtype; the argmax output is always int32.
MS_REG_GPU_KERNEL_TWO(
  MaxPoolWithArgmax,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
  MaxPoolWithArgmaxGpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
  MaxPoolWithArgmax,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32),
  MaxPoolWithArgmaxGpuFwdKernel, half, int)
}  // namespace kernel
}  // namespace mindspore
| @@ -0,0 +1,160 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||||
| #include <algorithm> | |||||
| #include <vector> | |||||
| #include <string> | |||||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||||
| #include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cuh" | |||||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | |||||
| namespace mindspore { | |||||
| namespace kernel { | |||||
// GPU forward kernel for MaxPoolWithArgmax: produces both the pooled output
// (output 0) and the flat per-channel-plane index of each window maximum
// (output 1), which the MaxPoolGradWithArgmax backward kernel consumes.
template <typename T, typename S>
class MaxPoolWithArgmaxGpuFwdKernel : public GpuKernel {
 public:
  MaxPoolWithArgmaxGpuFwdKernel()
      : n_(0),
        c_(0),
        input_height_(0),
        input_width_(0),
        window_height_(0),
        window_width_(0),
        pad_height_(0),
        pad_width_(0),
        pad_top_(0),
        pad_left_(0),
        stride_height_(0),
        stride_width_(0),
        output_height_(0),
        output_width_(0),
        input_size_(0),
        output_size_(0) {}
  ~MaxPoolWithArgmaxGpuFwdKernel() override = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  // Dispatches the CUDA forward kernel using the geometry cached by Init().
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) {
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    S *index_addr = GetDeviceAddress<S>(outputs, 1);
    CalMaxPoolWithArgmax(input_addr, n_, c_, input_height_, input_width_, window_height_, window_width_, stride_height_,
                         stride_width_, pad_top_, pad_left_, output_height_, output_width_, output_addr, index_addr,
                         reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  // Validates input/output counts, reads shapes (NCHW) and primitive
  // attributes, and precomputes the byte sizes reported to the framework.
  bool Init(const CNodePtr &kernel_node) {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolWithArgmax needs 1 inputs.";
      return false;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 2) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolWithArgmax needs 2 output.";
      return false;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    // Total byte sizes: sizeof(element) * product of all dims.
    input_size_ = sizeof(T);
    for (auto x : input_shape) {
      input_size_ *= x;
    }
    output_size_ = sizeof(T);
    for (auto x : output_shape) {
      output_size_ *= x;
    }
    // Shapes are indexed as NCHW (dims 0..3).
    n_ = SizeToInt(input_shape[0]);
    c_ = SizeToInt(input_shape[1]);
    input_height_ = SizeToInt(input_shape[2]);
    input_width_ = SizeToInt(input_shape[3]);
    output_height_ = SizeToInt(output_shape[2]);
    output_width_ = SizeToInt(output_shape[3]);
    // NOTE(review): ksize/strides are read at indices 1 and 2 — this assumes
    // the attr is laid out with (height, width) in those slots; confirm
    // against the MaxPoolWithArgmax op definition.
    auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize"));
    window_height_ = window[1];
    window_width_ = window[2];
    auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides"));
    stride_height_ = stride[1];
    stride_width_ = stride[2];
    pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding"));
    // VALID padding uses zero pad; SAME padding is computed in SetPad().
    pad_top_ = 0;
    pad_left_ = 0;
    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) {
      SetPad();
    }
    InitSizeLists();
    return true;
  }
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_size_);
    output_size_list_.push_back(output_size_);
    // The index output has the same element count as the pooled output but
    // S-sized elements.
    output_size_list_.push_back(output_size_ / sizeof(T) * sizeof(S));
  }
 private:
  // SAME padding: total pad = (ceil(in/stride) - 1) * stride + window - in,
  // clamped at 0; the top/left share is pad/2 (the extra row/column, if any,
  // falls on the bottom/right).
  void SetPad() {
    pad_height_ = std::max<int>(
      0, (((input_height_ / stride_height_) * stride_height_ == input_height_ ? (input_height_ / stride_height_)
                                                                              : (input_height_ / stride_height_) + 1) -
          1) *
             stride_height_ +
           window_height_ - input_height_);
    pad_width_ = std::max<int>(
      0, (((input_width_ / stride_width_) * stride_width_ == input_width_ ? (input_width_ / stride_width_)
                                                                          : (input_width_ / stride_width_) + 1) -
          1) *
             stride_width_ +
           window_width_ - input_width_);
    pad_top_ = pad_height_ / 2;
    pad_left_ = pad_width_ / 2;
  }
  std::string pad_mode_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int n_;              // batch size
  int c_;              // channel count
  int input_height_;
  int input_width_;
  int window_height_;
  int window_width_;
  int pad_height_;     // total vertical padding (SAME mode)
  int pad_width_;      // total horizontal padding (SAME mode)
  int pad_top_;
  int pad_left_;
  int stride_height_;
  int stride_width_;
  int output_height_;
  int output_width_;
  size_t input_size_;  // input tensor size in bytes
  size_t output_size_; // pooled output size in bytes
};
| } // namespace kernel | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||||
| @@ -0,0 +1,36 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.h" | |||||
namespace mindspore {
namespace kernel {
// Register the backward kernel for float32 and float16. Inputs are
// (x, dy, argmax-index); the index input is always int32 and the dx output
// matches the data dtype.
MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      MaxPoolWithArgmaxGradGpuKernel, float, int)
MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat16)
                        .AddInputAttr(kNumberTypeFloat16)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddOutputAttr(kNumberTypeFloat16),
                      MaxPoolWithArgmaxGradGpuKernel, half, int)
}  // namespace kernel
}  // namespace mindspore
| @@ -0,0 +1,168 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||||
| #include <algorithm> | |||||
| #include <vector> | |||||
| #include <string> | |||||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||||
| #include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cuh" | |||||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | |||||
| namespace mindspore { | |||||
| namespace kernel { | |||||
| template <typename T, typename S> | |||||
| class MaxPoolWithArgmaxGradGpuKernel : public GpuKernel { | |||||
| public: | |||||
| MaxPoolWithArgmaxGradGpuKernel() | |||||
| : n_(0), | |||||
| c_(0), | |||||
| x_height_(0), | |||||
| x_width_(0), | |||||
| dy_height_(0), | |||||
| dy_width_(0), | |||||
| x_size_(0), | |||||
| dy_size_(0), | |||||
| index_size_(0), | |||||
| dx_size_(0) {} | |||||
| ~MaxPoolWithArgmaxGradGpuKernel() override = default; | |||||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||||
| T *x_addr = GetDeviceAddress<T>(inputs, 0); | |||||
| T *dy_addr = GetDeviceAddress<T>(inputs, 1); | |||||
| S *index_addr = GetDeviceAddress<S>(inputs, 2); | |||||
| T *dx_addr = GetDeviceAddress<T>(outputs, 0); | |||||
| CalMaxPoolWithArgmaxGrad(x_addr, dy_addr, index_addr, n_, c_, x_height_, x_width_, dy_height_, dy_width_, | |||||
| window_height_, window_width_, stride_height_, stride_width_, pad_top_, pad_left_, dx_addr, | |||||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||||
| return true; | |||||
| } | |||||
| bool Init(const CNodePtr &kernel_node) { | |||||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||||
| if (input_num != 3) { | |||||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolGradWithArgmax needs 3 inputs."; | |||||
| return false; | |||||
| } | |||||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||||
| if (output_num != 1) { | |||||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolGradWithArgmax needs 1 output."; | |||||
| return false; | |||||
| } | |||||
| auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||||
| auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||||
| auto index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||||
| auto dx_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||||
| x_size_ = sizeof(T); | |||||
| for (auto x : x_shape) { | |||||
| x_size_ *= x; | |||||
| } | |||||
| dy_size_ = sizeof(T); | |||||
| for (auto x : dy_shape) { | |||||
| dy_size_ *= x; | |||||
| } | |||||
| index_size_ = sizeof(S); | |||||
| for (auto x : index_shape) { | |||||
| index_size_ *= x; | |||||
| } | |||||
| dx_size_ = sizeof(T); | |||||
| for (auto x : dx_shape) { | |||||
| dx_size_ *= x; | |||||
| } | |||||
| n_ = SizeToInt(x_shape[0]); | |||||
| c_ = SizeToInt(x_shape[1]); | |||||
| x_height_ = SizeToInt(x_shape[2]); | |||||
| x_width_ = SizeToInt(x_shape[3]); | |||||
| dy_height_ = SizeToInt(dy_shape[2]); | |||||
| dy_width_ = SizeToInt(dy_shape[3]); | |||||
| auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize")); | |||||
| window_height_ = window[1]; | |||||
| window_width_ = window[2]; | |||||
| auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides")); | |||||
| stride_height_ = stride[1]; | |||||
| stride_width_ = stride[2]; | |||||
| pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding")); | |||||
| pad_top_ = 0; | |||||
| pad_left_ = 0; | |||||
| if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) { | |||||
| SetPad(); | |||||
| } | |||||
| InitSizeLists(); | |||||
| return true; | |||||
| } | |||||
| protected: | |||||
| void InitSizeLists() override { | |||||
| input_size_list_.push_back(x_size_); | |||||
| input_size_list_.push_back(dy_size_); | |||||
| input_size_list_.push_back(index_size_); | |||||
| output_size_list_.push_back(dx_size_); | |||||
| } | |||||
| private: | |||||
| void SetPad() { | |||||
| pad_height_ = std::max<int>( | |||||
| 0, (((x_height_ / stride_height_) * stride_height_ == x_height_ ? (x_height_ / stride_height_) | |||||
| : (x_height_ / stride_height_) + 1) - | |||||
| 1) * | |||||
| stride_height_ + | |||||
| window_height_ - x_height_); | |||||
| pad_width_ = | |||||
| std::max<int>(0, (((x_width_ / stride_width_) * stride_width_ == x_width_ ? (x_width_ / stride_width_) | |||||
| : (x_width_ / stride_width_) + 1) - | |||||
| 1) * | |||||
| stride_width_ + | |||||
| window_width_ - x_width_); | |||||
| pad_top_ = pad_height_ / 2; | |||||
| pad_left_ = pad_width_ / 2; | |||||
| } | |||||
| std::string pad_mode_; | |||||
| std::vector<size_t> input_size_list_; | |||||
| std::vector<size_t> output_size_list_; | |||||
| std::vector<size_t> workspace_size_list_; | |||||
| int n_; | |||||
| int c_; | |||||
| int x_height_; | |||||
| int x_width_; | |||||
| int dy_height_; | |||||
| int dy_width_; | |||||
| int window_height_; | |||||
| int window_width_; | |||||
| int pad_height_; | |||||
| int pad_width_; | |||||
| int pad_top_; | |||||
| int pad_left_; | |||||
| int stride_height_; | |||||
| int stride_width_; | |||||
| size_t x_size_; | |||||
| size_t dy_size_; | |||||
| size_t index_size_; | |||||
| size_t dx_size_; | |||||
| }; | |||||
| } // namespace kernel | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||||
| @@ -20,11 +20,17 @@ | |||||
| #include "utils/utils.h" | #include "utils/utils.h" | ||||
| #include "backend/kernel_compiler/hccl/hcom_util.h" | #include "backend/kernel_compiler/hccl/hcom_util.h" | ||||
| #include "backend/session/anf_runtime_algorithm.h" | #include "backend/session/anf_runtime_algorithm.h" | ||||
| #include "frontend/parallel/context.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| namespace { | namespace { | ||||
| std::string GetKernelFormat(const CNodePtr &kernel_node, size_t index) { | std::string GetKernelFormat(const CNodePtr &kernel_node, size_t index) { | ||||
| auto parallel_context_instance = parallel::ParallelContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(parallel_context_instance); | |||||
| if (parallel_context_instance->enable_parallel_optimizer()) { | |||||
| return kOpFormat_DEFAULT; | |||||
| } | |||||
| const std::set<std::string> kReduceNoSupportedSet = {kOpFormat_FRAC_Z, kOpFormat_FRACTAL_Z_C04, kOpFormat_C1HWNCoC0}; | const std::set<std::string> kReduceNoSupportedSet = {kOpFormat_FRAC_Z, kOpFormat_FRACTAL_Z_C04, kOpFormat_C1HWNCoC0}; | ||||
| auto op_name = AnfAlgo::GetCNodeName(kernel_node); | auto op_name = AnfAlgo::GetCNodeName(kernel_node); | ||||
| auto format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, index); | auto format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, index); | ||||
| @@ -65,6 +65,9 @@ size_t KernelBuildInfo::GetInputNum() const { return inputs_format_.size(); } | |||||
| size_t KernelBuildInfo::GetOutputNum() const { return outputs_format_.size(); } | size_t KernelBuildInfo::GetOutputNum() const { return outputs_format_.size(); } | ||||
| std::vector<Axis> KernelBuildInfo::GetInputReshapeType(size_t input_index) const { | std::vector<Axis> KernelBuildInfo::GetInputReshapeType(size_t input_index) const { | ||||
| if (input_reshape_type_.empty()) { | |||||
| return {}; | |||||
| } | |||||
| if (input_index >= input_reshape_type_.size()) { | if (input_index >= input_reshape_type_.size()) { | ||||
| MS_LOG(EXCEPTION) << "The index [" << input_index << "] is exceed the number of input node size " | MS_LOG(EXCEPTION) << "The index [" << input_index << "] is exceed the number of input node size " | ||||
| << input_reshape_type_.size(); | << input_reshape_type_.size(); | ||||
| @@ -73,6 +76,9 @@ std::vector<Axis> KernelBuildInfo::GetInputReshapeType(size_t input_index) const | |||||
| } | } | ||||
| std::vector<Axis> KernelBuildInfo::GetOutputReshapeType(size_t output_index) const { | std::vector<Axis> KernelBuildInfo::GetOutputReshapeType(size_t output_index) const { | ||||
| if (output_reshape_type_.empty()) { | |||||
| return {}; | |||||
| } | |||||
| if (output_index >= output_reshape_type_.size()) { | if (output_index >= output_reshape_type_.size()) { | ||||
| MS_LOG(EXCEPTION) << "The index [" << output_index << "] is exceed the number of output node size " | MS_LOG(EXCEPTION) << "The index [" << output_index << "] is exceed the number of output node size " | ||||
| << output_reshape_type_.size(); | << output_reshape_type_.size(); | ||||
| @@ -158,13 +164,13 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetProcessor(Processor processor) | |||||
| std::shared_ptr<KernelBuildInfo> KernelBuildInfo::KernelBuildInfoBuilder::Build() { return kernel_build_info_; } | std::shared_ptr<KernelBuildInfo> KernelBuildInfo::KernelBuildInfoBuilder::Build() { return kernel_build_info_; } | ||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetInputReshapeType( | |||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetInputsReshapeType( | |||||
| const std::vector<std::vector<Axis>> &input_reshape_type) { | const std::vector<std::vector<Axis>> &input_reshape_type) { | ||||
| MS_EXCEPTION_IF_NULL(kernel_build_info_); | MS_EXCEPTION_IF_NULL(kernel_build_info_); | ||||
| kernel_build_info_->input_reshape_type_ = input_reshape_type; | kernel_build_info_->input_reshape_type_ = input_reshape_type; | ||||
| } | } | ||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputReshapeType( | |||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputsReshapeType( | |||||
| const std::vector<std::vector<Axis>> &output_reshape_type) { | const std::vector<std::vector<Axis>> &output_reshape_type) { | ||||
| MS_EXCEPTION_IF_NULL(kernel_build_info_); | MS_EXCEPTION_IF_NULL(kernel_build_info_); | ||||
| kernel_build_info_->output_reshape_type_ = output_reshape_type; | kernel_build_info_->output_reshape_type_ = output_reshape_type; | ||||
| @@ -189,5 +195,37 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputFormat(const std::string | |||||
| } | } | ||||
| kernel_build_info_->outputs_format_[index] = format; | kernel_build_info_->outputs_format_[index] = format; | ||||
| } | } | ||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetInputReshapeType(const std::vector<Axis> &input_reshape_type, | |||||
| size_t index) { | |||||
| if (index >= kernel_build_info_->input_reshape_type_.size()) { | |||||
| MS_LOG(EXCEPTION) << "index outof range!"; | |||||
| } | |||||
| std::copy(input_reshape_type.begin(), input_reshape_type.end(), | |||||
| std::back_inserter(kernel_build_info_->input_reshape_type_[index])); | |||||
| } | |||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputReshapeType(const std::vector<Axis> &output_reshape_type, | |||||
| size_t index) { | |||||
| if (index >= kernel_build_info_->output_reshape_type_.size()) { | |||||
| MS_LOG(EXCEPTION) << "index outof range!"; | |||||
| } | |||||
| std::copy(output_reshape_type.begin(), output_reshape_type.end(), | |||||
| std::back_inserter(kernel_build_info_->output_reshape_type_[index])); | |||||
| } | |||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputDeviceType(const TypeId &output_device_type, size_t index) { | |||||
| if (index >= kernel_build_info_->outputs_device_type_.size()) { | |||||
| MS_LOG(EXCEPTION) << "index outof range!"; | |||||
| } | |||||
| kernel_build_info_->outputs_device_type_[index] = output_device_type; | |||||
| } | |||||
| void KernelBuildInfo::KernelBuildInfoBuilder::SetInputDeviceType(const TypeId &input_device_type, size_t index) { | |||||
| if (index >= kernel_build_info_->inputs_device_type_.size()) { | |||||
| MS_LOG(EXCEPTION) << "index outof range!"; | |||||
| } | |||||
| kernel_build_info_->inputs_device_type_[index] = input_device_type; | |||||
| } | |||||
| } // namespace kernel | } // namespace kernel | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -71,6 +71,10 @@ class KernelBuildInfo { | |||||
| std::vector<TypeId> GetAllOutputDeviceTypes() const; | std::vector<TypeId> GetAllOutputDeviceTypes() const; | ||||
| std::vector<std::vector<Axis>> GetAllOutputReshapeType() const; | |||||
| std::vector<std::vector<Axis>> GetAllInputReshapeType() const; | |||||
| OpPattern op_pattern() const { return op_pattern_; } | OpPattern op_pattern() const { return op_pattern_; } | ||||
| FusionType fusion_type() const { return fusion_type_; } | FusionType fusion_type() const { return fusion_type_; } | ||||
| @@ -108,8 +112,23 @@ class KernelBuildInfo::KernelBuildInfoBuilder { | |||||
| public: | public: | ||||
| KernelBuildInfoBuilder() { kernel_build_info_ = std::make_shared<KernelBuildInfo>(); } | KernelBuildInfoBuilder() { kernel_build_info_ = std::make_shared<KernelBuildInfo>(); } | ||||
| explicit KernelBuildInfoBuilder(std::shared_ptr<KernelBuildInfo> kernel_build_info) | |||||
| : kernel_build_info_(std::move(kernel_build_info)) {} | |||||
| explicit KernelBuildInfoBuilder(const std::shared_ptr<KernelBuildInfo> &kernel_build_info) | |||||
| : kernel_build_info_(std::make_shared<KernelBuildInfo>()) { | |||||
| SetKernelType(kernel_build_info->kernel_type()); | |||||
| SetFusionType(kernel_build_info->fusion_type()); | |||||
| SetProcessor(kernel_build_info->processor()); | |||||
| OpPattern(kernel_build_info->op_pattern()); | |||||
| for (size_t index = 0; index < kernel_build_info->GetInputNum(); ++index) { | |||||
| kernel_build_info_->inputs_device_type_.emplace_back(kernel_build_info->GetInputDeviceType(index)); | |||||
| kernel_build_info_->inputs_format_.emplace_back(kernel_build_info->GetInputFormat(index)); | |||||
| kernel_build_info_->input_reshape_type_.emplace_back(kernel_build_info->GetInputReshapeType(index)); | |||||
| } | |||||
| for (size_t index = 0; index < kernel_build_info->GetOutputNum(); ++index) { | |||||
| kernel_build_info_->outputs_device_type_.emplace_back(kernel_build_info->GetOutputDeviceType(index)); | |||||
| kernel_build_info_->outputs_format_.emplace_back(kernel_build_info->GetOutputFormat(index)); | |||||
| kernel_build_info_->output_reshape_type_.emplace_back(kernel_build_info->GetOutputReshapeType(index)); | |||||
| } | |||||
| } | |||||
| ~KernelBuildInfoBuilder() = default; | ~KernelBuildInfoBuilder() = default; | ||||
| @@ -123,9 +142,9 @@ class KernelBuildInfo::KernelBuildInfoBuilder { | |||||
| void SetOutputsDeviceType(const std::vector<TypeId> &outputs_device_type); | void SetOutputsDeviceType(const std::vector<TypeId> &outputs_device_type); | ||||
| void SetInputReshapeType(const std::vector<std::vector<Axis>> &input_reshape_type); | |||||
| void SetInputsReshapeType(const std::vector<std::vector<Axis>> &input_reshape_type); | |||||
| void SetOutputReshapeType(const std::vector<std::vector<Axis>> &output_reshape_type); | |||||
| void SetOutputsReshapeType(const std::vector<std::vector<Axis>> &output_reshape_type); | |||||
| void SetFusionType(FusionType fusion_type); | void SetFusionType(FusionType fusion_type); | ||||
| @@ -137,6 +156,14 @@ class KernelBuildInfo::KernelBuildInfoBuilder { | |||||
| void SetOutputFormat(const std::string &format, size_t index); | void SetOutputFormat(const std::string &format, size_t index); | ||||
| void SetInputReshapeType(const std::vector<Axis> &input_reshape_type, size_t index); | |||||
| void SetOutputReshapeType(const std::vector<Axis> &output_reshape_type, size_t index); | |||||
| void SetInputDeviceType(const TypeId &input_device_type, size_t index); | |||||
| void SetOutputDeviceType(const TypeId &output_device_type, size_t index); | |||||
| std::shared_ptr<KernelBuildInfo> Build(); | std::shared_ptr<KernelBuildInfo> Build(); | ||||
| private: | private: | ||||
| @@ -118,7 +118,7 @@ void TbeKernelSelect::GetCommonPatternKernelInfo(const OpInfo &op_info) { | |||||
| } | } | ||||
| builder.SetInputsDeviceType(inputs_device_type); | builder.SetInputsDeviceType(inputs_device_type); | ||||
| builder.SetInputsFormat(inputs_format); | builder.SetInputsFormat(inputs_format); | ||||
| builder.SetInputReshapeType(inputs_reshape_type); | |||||
| builder.SetInputsReshapeType(inputs_reshape_type); | |||||
| // output | // output | ||||
| std::vector<std::string> outputs_format; | std::vector<std::string> outputs_format; | ||||
| std::vector<TypeId> outputs_device_type; | std::vector<TypeId> outputs_device_type; | ||||
| @@ -129,7 +129,7 @@ void TbeKernelSelect::GetCommonPatternKernelInfo(const OpInfo &op_info) { | |||||
| } | } | ||||
| builder.SetOutputsDeviceType(outputs_device_type); | builder.SetOutputsDeviceType(outputs_device_type); | ||||
| builder.SetOutputsFormat(outputs_format); | builder.SetOutputsFormat(outputs_format); | ||||
| builder.SetOutputReshapeType(outputs_reshape_type); | |||||
| builder.SetOutputsReshapeType(outputs_reshape_type); | |||||
| kernel_info_list_->emplace_back(builder.Build()); | kernel_info_list_->emplace_back(builder.Build()); | ||||
| } | } | ||||
| MS_LOG(INFO) << "end."; | MS_LOG(INFO) << "end."; | ||||
| @@ -47,6 +47,7 @@ | |||||
| #include "backend/optimizer/ascend/ir_fission/transdata_split.h" | #include "backend/optimizer/ascend/ir_fission/transdata_split.h" | ||||
| #include "backend/optimizer/ascend/ir_fission/topk_split.h" | #include "backend/optimizer/ascend/ir_fission/topk_split.h" | ||||
| #include "backend/optimizer/ascend/ir_fusion/momentum_lossscale_fusion.h" | #include "backend/optimizer/ascend/ir_fusion/momentum_lossscale_fusion.h" | ||||
| #include "backend/optimizer/ascend/format_type/split_unsupported_transdata.h" | |||||
| #include "backend/optimizer/ascend/ir_fusion/mul_add_fusion.h" | #include "backend/optimizer/ascend/ir_fusion/mul_add_fusion.h" | ||||
| #include "backend/optimizer/ascend/ir_fusion/mul_addn_fusion.h" | #include "backend/optimizer/ascend/ir_fusion/mul_addn_fusion.h" | ||||
| #include "backend/optimizer/ascend/ir_fusion/matmul_biasadd_fusion.h" | #include "backend/optimizer/ascend/ir_fusion/matmul_biasadd_fusion.h" | ||||
| @@ -228,6 +229,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap | |||||
| mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>()); | mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>()); | ||||
| mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>()); | mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>()); | ||||
| mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>()); | mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>()); | ||||
| mixed_precision_pm->AddPass(std::make_shared<SplitUnsupportedTransData>()); | |||||
| mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>()); | mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>()); | ||||
| mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>()); | mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>()); | ||||
| optimizer->AddPassManager(mixed_precision_pm); | optimizer->AddPassManager(mixed_precision_pm); | ||||
| @@ -153,7 +153,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const | |||||
| std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); | std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); | ||||
| if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { | if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { | ||||
| auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false); | auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false); | ||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node, output_idx)) { | |||||
| kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0); | kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0); | ||||
| } | } | ||||
| make_tuple_inputs.emplace_back(trans_op); | make_tuple_inputs.emplace_back(trans_op); | ||||
| @@ -174,8 +174,8 @@ void RefreshKernelBuildInfo(const std::string &input_format, const std::string & | |||||
| MS_EXCEPTION_IF_NULL(ori_build_info); | MS_EXCEPTION_IF_NULL(ori_build_info); | ||||
| auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(ori_build_info); | auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(ori_build_info); | ||||
| builder->SetInputsFormat({input_format}); | builder->SetInputsFormat({input_format}); | ||||
| builder->SetInputReshapeType({reshape_type}); | |||||
| builder->SetOutputReshapeType({reshape_type}); | |||||
| builder->SetInputsReshapeType({reshape_type}); | |||||
| builder->SetOutputsReshapeType({reshape_type}); | |||||
| builder->SetOutputsFormat({output_format}); | builder->SetOutputsFormat({output_format}); | ||||
| if (type_id != kTypeUnknown) { | if (type_id != kTypeUnknown) { | ||||
| builder->SetOutputsDeviceType({type_id}); | builder->SetOutputsDeviceType({type_id}); | ||||
| @@ -265,7 +265,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP | |||||
| // Single output | // Single output | ||||
| if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { | if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { | ||||
| auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select); | auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select); | ||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node, 0)) { | |||||
| kernel_graph->ReplaceInternalOutput(node, new_node); | kernel_graph->ReplaceInternalOutput(node, new_node); | ||||
| } | } | ||||
| return new_node; | return new_node; | ||||
| @@ -40,6 +40,38 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) { | |||||
| return real_node->isa<ValueNode>(); | return real_node->isa<ValueNode>(); | ||||
| } | } | ||||
| void SetInput(const CNodePtr &control_depend, const int index, const FuncGraphPtr &graph, const CNodePtr &hccl_node, | |||||
| const std::vector<AnfNodePtr> &memcpy_async_list) { | |||||
| MS_EXCEPTION_IF_NULL(control_depend); | |||||
| MS_EXCEPTION_IF_NULL(graph); | |||||
| MS_EXCEPTION_IF_NULL(hccl_node); | |||||
| std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple)}; | |||||
| make_tuple_inputs.insert(make_tuple_inputs.end(), memcpy_async_list.begin(), memcpy_async_list.end()); | |||||
| make_tuple_inputs.emplace_back(hccl_node); | |||||
| auto make_tuple = graph->NewCNode(make_tuple_inputs); | |||||
| MS_EXCEPTION_IF_NULL(make_tuple); | |||||
| control_depend->set_input(IntToSize(index), make_tuple); | |||||
| } | |||||
| void DealControlForGetitem(const CNodePtr &tuple_getitem, const FuncGraphPtr &graph, const CNodePtr &hccl_node, | |||||
| const std::vector<AnfNodePtr> &memcpy_async_list) { | |||||
| MS_EXCEPTION_IF_NULL(tuple_getitem); | |||||
| auto manager = graph->manager(); | |||||
| MS_EXCEPTION_IF_NULL(manager); | |||||
| auto &node_users = manager->node_users(); | |||||
| auto iter = node_users.find(tuple_getitem); | |||||
| if (iter == node_users.end()) { | |||||
| MS_LOG(EXCEPTION) << "node has no output in manager"; | |||||
| } | |||||
| for (const auto &node_index : iter->second) { | |||||
| AnfNodePtr output = node_index.first; | |||||
| MS_EXCEPTION_IF_NULL(output); | |||||
| if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimControlDepend)) { | |||||
| SetInput(output->cast<CNodePtr>(), node_index.second, graph, hccl_node, memcpy_async_list); | |||||
| } | |||||
| } | |||||
| } | |||||
| void TransferControl(const CNodePtr &hccl_node, const std::vector<AnfNodePtr> &memcpy_async_list, | void TransferControl(const CNodePtr &hccl_node, const std::vector<AnfNodePtr> &memcpy_async_list, | ||||
| const FuncGraphPtr &graph) { | const FuncGraphPtr &graph) { | ||||
| MS_EXCEPTION_IF_NULL(hccl_node); | MS_EXCEPTION_IF_NULL(hccl_node); | ||||
| @@ -53,25 +85,13 @@ void TransferControl(const CNodePtr &hccl_node, const std::vector<AnfNodePtr> &m | |||||
| } | } | ||||
| // find hccl_node's output which is a control depend | // find hccl_node's output which is a control depend | ||||
| for (const auto &node_index : iter->second) { | for (const auto &node_index : iter->second) { | ||||
| if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimControlDepend)) { | |||||
| continue; | |||||
| } | |||||
| CNodePtr control_depend = node_index.first->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(control_depend); | |||||
| std::vector<AnfNodePtr> new_inputs; | |||||
| for (size_t i = 0; i < control_depend->size(); ++i) { | |||||
| if (i == IntToSize(node_index.second)) { | |||||
| std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple)}; | |||||
| make_tuple_inputs.insert(make_tuple_inputs.end(), memcpy_async_list.begin(), memcpy_async_list.end()); | |||||
| make_tuple_inputs.emplace_back(hccl_node); | |||||
| auto make_tuple = graph->NewCNode(make_tuple_inputs); | |||||
| MS_EXCEPTION_IF_NULL(make_tuple); | |||||
| new_inputs.push_back(make_tuple); | |||||
| } else { | |||||
| new_inputs.push_back(control_depend->input(i)); | |||||
| } | |||||
| AnfNodePtr output = node_index.first; | |||||
| MS_EXCEPTION_IF_NULL(output); | |||||
| if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimControlDepend)) { | |||||
| SetInput(output->cast<CNodePtr>(), node_index.second, graph, hccl_node, memcpy_async_list); | |||||
| } else if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimTupleGetItem)) { | |||||
| DealControlForGetitem(output->cast<CNodePtr>(), graph, hccl_node, memcpy_async_list); | |||||
| } | } | ||||
| control_depend->set_inputs(new_inputs); | |||||
| } | } | ||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| @@ -148,11 +168,10 @@ const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_gr | |||||
| if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) { | if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) { | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| auto cnode = node->cast<CNodePtr>(); | |||||
| if (!AnfAlgo::IsCommunicationOp(node)) { | if (!AnfAlgo::IsCommunicationOp(node)) { | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| InsertMemcpyAsync(func_graph, cnode); | |||||
| InsertMemcpyAsync(func_graph, node->cast<CNodePtr>()); | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| } // namespace opt | } // namespace opt | ||||
| @@ -65,7 +65,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo | |||||
| MS_EXCEPTION_IF_NULL(replace_node); | MS_EXCEPTION_IF_NULL(replace_node); | ||||
| replace_node->set_scope(cnode->scope()); | replace_node->set_scope(cnode->scope()); | ||||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | ||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode, output_idx)) { | |||||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0); | kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0); | ||||
| } | } | ||||
| } else { | } else { | ||||
| @@ -114,7 +114,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c | |||||
| MS_EXCEPTION_IF_NULL(replace_node); | MS_EXCEPTION_IF_NULL(replace_node); | ||||
| replace_node->set_scope(cnode->scope()); | replace_node->set_scope(cnode->scope()); | ||||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | ||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode, 0)) { | |||||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node); | kernel_graph->ReplaceInternalOutput(cnode, replace_node); | ||||
| } | } | ||||
| } | } | ||||
| @@ -58,7 +58,7 @@ const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, c | |||||
| if (kernel_graph == nullptr) { | if (kernel_graph == nullptr) { | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| if (!kernel_graph->IsInternalOutput(node)) { | |||||
| if (!kernel_graph->IsInternalOutput(node, 0)) { | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| if (!UsedForOutputOnly(func_graph, node)) { | if (!UsedForOutputOnly(func_graph, node)) { | ||||
| @@ -0,0 +1,65 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "backend/optimizer/ascend/format_type/split_unsupported_transdata.h" | |||||
| #include <vector> | |||||
| #include <memory> | |||||
| #include "backend/session/anf_runtime_algorithm.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| const BaseRef SplitUnsupportedTransData::DefinePattern() const { | |||||
| VarPtr X = std::make_shared<Var>(); | |||||
| return VectorRef({prim::KPrimTransData, X}); | |||||
| } | |||||
| const AnfNodePtr SplitUnsupportedTransData::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||||
| const EquivPtr &) const { | |||||
| if (node == nullptr || !node->isa<CNode>() || !AnfAlgo::IsRealKernel(node)) { | |||||
| return nullptr; | |||||
| } | |||||
| auto ori_trans_data = node->cast<CNodePtr>(); | |||||
| if (AnfAlgo::GetCNodeName(ori_trans_data) != prim::KPrimTransData->name()) { | |||||
| return nullptr; | |||||
| } | |||||
| auto kernel_info = AnfAlgo::GetSelectKernelBuildInfo(ori_trans_data); | |||||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||||
| if (kernel_info->GetInputNum() != 1 || kernel_info->GetOutputNum() != 1) { | |||||
| MS_LOG(EXCEPTION) << "Transdata node's kernel info's input and output format size is not 1" | |||||
| << ori_trans_data->DebugString(); | |||||
| } | |||||
| return SplitTransData(func_graph, ori_trans_data); | |||||
| } | |||||
| AnfNodePtr SplitUnsupportedTransData::SplitTransData(const FuncGraphPtr &func_graph, const CNodePtr &trans_node) const { | |||||
| auto kernel_info = AnfAlgo::GetSelectKernelBuildInfo(trans_node); | |||||
| if (kHWSpecialFormatSet.find(kernel_info->GetInputFormat(0)) == kHWSpecialFormatSet.end() || | |||||
| kHWSpecialFormatSet.find(kernel_info->GetOutputFormat(0)) == kHWSpecialFormatSet.end()) { | |||||
| return trans_node; | |||||
| } | |||||
| auto builder_info_to_default = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(kernel_info); | |||||
| auto builder_info_to_special_foramt = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(kernel_info); | |||||
| builder_info_to_default->SetOutputsFormat({kOpFormat_DEFAULT}); | |||||
| builder_info_to_special_foramt->SetInputsFormat({kOpFormat_DEFAULT}); | |||||
| std::vector<AnfNodePtr> next_trans_node_inputs = { | |||||
| NewValueNode(std::make_shared<Primitive>(prim::KPrimTransData->name())), trans_node}; | |||||
| auto next_trans_node = func_graph->NewCNode(next_trans_node_inputs); | |||||
| next_trans_node->set_abstract(trans_node->abstract()); | |||||
| AnfAlgo::SetSelectKernelBuildInfo(builder_info_to_default->Build(), trans_node.get()); | |||||
| AnfAlgo::SetSelectKernelBuildInfo(builder_info_to_special_foramt->Build(), next_trans_node.get()); | |||||
| return next_trans_node; | |||||
| } | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,37 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H | |||||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H | |||||
| #include "backend/optimizer/common/optimizer.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| class SplitUnsupportedTransData : public PatternProcessPass { | |||||
| public: | |||||
| explicit SplitUnsupportedTransData(bool multigraph = true) | |||||
| : PatternProcessPass("split_unsupported_transdata", multigraph) {} | |||||
| ~SplitUnsupportedTransData() override = default; | |||||
| const BaseRef DefinePattern() const override; | |||||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||||
| private: | |||||
| AnfNodePtr SplitTransData(const FuncGraphPtr &func_graph, const CNodePtr &trans_node) const; | |||||
| }; | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_REMOVE_TRANSDATA_SPILT_H | |||||
| @@ -405,7 +405,7 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod | |||||
| } | } | ||||
| auto node = cnode->input(input_idx + 1); | auto node = cnode->input(input_idx + 1); | ||||
| MS_EXCEPTION_IF_NULL(node); | MS_EXCEPTION_IF_NULL(node); | ||||
| return VisitKernel(node, 0); | |||||
| return VisitKernelWithReturnType(node, 0); | |||||
| } | } | ||||
| std::string AnfRuntimeAlgorithm::GetPrevNodeOutputFormat(const AnfNodePtr &anf_node, size_t input_idx) { | std::string AnfRuntimeAlgorithm::GetPrevNodeOutputFormat(const AnfNodePtr &anf_node, size_t input_idx) { | ||||
| @@ -94,25 +94,33 @@ bool AscendInferenceSession::CheckModelInputs(uint32_t graph_id, const std::vect | |||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | MS_EXCEPTION_IF_NULL(kernel_graph); | ||||
| auto kernel_graph_inputs = kernel_graph->inputs(); | auto kernel_graph_inputs = kernel_graph->inputs(); | ||||
| size_t no_weight_input = 0; | size_t no_weight_input = 0; | ||||
| vector<ParameterPtr> paras; | |||||
| // find parameters of graph inputs | |||||
| for (size_t i = 0; i < kernel_graph_inputs.size(); ++i) { | for (size_t i = 0; i < kernel_graph_inputs.size(); ++i) { | ||||
| tensor::TensorPtr tensor = nullptr; | |||||
| if (!kernel_graph_inputs[i]->isa<Parameter>()) { | if (!kernel_graph_inputs[i]->isa<Parameter>()) { | ||||
| MS_LOG(ERROR) << "Kernel graph inputs have anfnode which is not Parameter."; | MS_LOG(ERROR) << "Kernel graph inputs have anfnode which is not Parameter."; | ||||
| continue; | continue; | ||||
| } | } | ||||
| auto parameter = kernel_graph_inputs[i]->cast<ParameterPtr>(); | auto parameter = kernel_graph_inputs[i]->cast<ParameterPtr>(); | ||||
| if (!AnfAlgo::IsParameterWeight(parameter)) { | if (!AnfAlgo::IsParameterWeight(parameter)) { | ||||
| // compare input number | |||||
| if (no_weight_input >= inputs.size()) { | |||||
| MS_LOG(ERROR) << "Input number is inconsistent. The actual input number [" << inputs.size() | |||||
| << "] less than that of graph."; | |||||
| return false; | |||||
| } | |||||
| auto input = inputs[no_weight_input++]; | |||||
| if (!CompareInput(input, parameter)) { | |||||
| MS_LOG(ERROR) << "Please check the input information."; | |||||
| return false; | |||||
| } | |||||
| paras.push_back(parameter); | |||||
| } | |||||
| } | |||||
| // check inputs | |||||
| for (size_t i = 0; i < paras.size(); ++i) { | |||||
| // compare input number | |||||
| if (paras.size() != inputs.size()) { | |||||
| MS_LOG(ERROR) << "Input number is inconsistent. The actual input number [" << inputs.size() | |||||
| << "] but the graph input number is [" << paras.size() << "]"; | |||||
| MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs); | |||||
| return false; | |||||
| } | |||||
| auto input = inputs[no_weight_input++]; | |||||
| if (!CompareInput(input, paras[i])) { | |||||
| MS_LOG(ERROR) << "Please check the input information."; | |||||
| MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs); | |||||
| return false; | |||||
| } | } | ||||
| } | } | ||||
| return true; | return true; | ||||
| @@ -123,12 +131,6 @@ bool AscendInferenceSession::CompareInput(const tensor::TensorPtr &input, const | |||||
| MS_EXCEPTION_IF_NULL(parameter); | MS_EXCEPTION_IF_NULL(parameter); | ||||
| // compare dims | // compare dims | ||||
| auto parameter_shape = AnfAlgo::GetOutputDeviceShape(parameter, 0); | auto parameter_shape = AnfAlgo::GetOutputDeviceShape(parameter, 0); | ||||
| if (input->shape().size() != parameter_shape.size()) { | |||||
| MS_LOG(ERROR) << "Input dim is inconsistent. The actual dim is " << input->shape().size() | |||||
| << ", but the parameter dim is " << parameter_shape.size() | |||||
| << ". parameter : " << parameter->DebugString(); | |||||
| return false; | |||||
| } | |||||
| // compare shape | // compare shape | ||||
| auto input_shape = input->shape(); | auto input_shape = input->shape(); | ||||
| @@ -153,12 +155,31 @@ bool AscendInferenceSession::CompareInput(const tensor::TensorPtr &input, const | |||||
| return true; | return true; | ||||
| } | } | ||||
| std::string AscendInferenceSession::PrintInputShape(std::vector<size_t> shape) const { | |||||
| template <typename T> | |||||
| std::string AscendInferenceSession::PrintInputShape(std::vector<T> shape) const { | |||||
| string res = "["; | string res = "["; | ||||
| for (auto dim : shape) { | for (auto dim : shape) { | ||||
| res += " " + std::to_string(dim); | res += " " + std::to_string(dim); | ||||
| } | } | ||||
| return res + " ]"; | return res + " ]"; | ||||
| } | } | ||||
| std::string AscendInferenceSession::InputsInfo(const std::vector<ParameterPtr> ¶s, | |||||
| const std::vector<tensor::TensorPtr> &inputs) const { | |||||
| std::string graph = "graph inputs:{ "; | |||||
| for (size_t i = 0; i < paras.size(); ++i) { | |||||
| graph += std::to_string(i) + ": dims " + std::to_string(AnfAlgo::GetOutputDeviceShape(paras[i], 0).size()) + | |||||
| ", shape " + PrintInputShape(AnfAlgo::GetOutputDeviceShape(paras[i], 0)) + ", data type " + | |||||
| std::to_string(AnfAlgo::GetSelectKernelBuildInfo(paras[i])->GetOutputDeviceType(0)) + " }"; | |||||
| } | |||||
| std::string actual = "actual inputs:{ "; | |||||
| for (size_t i = 0; i < inputs.size(); ++i) { | |||||
| actual += std::to_string(i) + ": dims " + std::to_string(inputs[i]->shape().size()) + ", shape " + | |||||
| PrintInputShape(inputs[i]->shape()) + ", data type " + std::to_string(inputs[i]->data_type()) + " }"; | |||||
| } | |||||
| return graph + " " + actual; | |||||
| } | |||||
| } // namespace session | } // namespace session | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -41,7 +41,9 @@ class AscendInferenceSession : public AscendSession { | |||||
| GraphId CompileGraph(NotNull<FuncGraphPtr> func_graph) override; | GraphId CompileGraph(NotNull<FuncGraphPtr> func_graph) override; | ||||
| bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs) const override; | bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs) const override; | ||||
| bool CompareInput(const tensor::TensorPtr &input, const ParameterPtr ¶meter) const; | bool CompareInput(const tensor::TensorPtr &input, const ParameterPtr ¶meter) const; | ||||
| std::string PrintInputShape(std::vector<size_t> shape) const; | |||||
| template <typename T> | |||||
| std::string PrintInputShape(std::vector<T> shape) const; | |||||
| std::string InputsInfo(const std::vector<ParameterPtr> ¶s, const std::vector<tensor::TensorPtr> &inputs) const; | |||||
| }; | }; | ||||
| MS_REG_SESSION(kDavinciInferenceDevice, AscendInferenceSession); | MS_REG_SESSION(kDavinciInferenceDevice, AscendInferenceSession); | ||||
| } // namespace session | } // namespace session | ||||
| @@ -517,9 +517,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor:: | |||||
| LoadInputData(kernel_graph, inputs); | LoadInputData(kernel_graph, inputs); | ||||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | ||||
| // Initialize parameter server | // Initialize parameter server | ||||
| if (!ps_init_) { | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| } | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| #endif | #endif | ||||
| // convert inputs to model | // convert inputs to model | ||||
| predictmodel::StepConvertWeight(inputs); | predictmodel::StepConvertWeight(inputs); | ||||
| @@ -91,10 +91,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten | |||||
| auto &kernel_graph = graphs_[graph_id]; | auto &kernel_graph = graphs_[graph_id]; | ||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | MS_EXCEPTION_IF_NULL(kernel_graph); | ||||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | ||||
| // Initialize parameter server | |||||
| if (!ps_init_) { | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| } | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| #endif | #endif | ||||
| MS_LOG(INFO) << "Bind input output address"; | MS_LOG(INFO) << "Bind input output address"; | ||||
| std::vector<tensor::TensorPtr> need_sync_outputs; | std::vector<tensor::TensorPtr> need_sync_outputs; | ||||
| @@ -233,9 +233,7 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten | |||||
| LoadInputData(kernel_graph, inputs); | LoadInputData(kernel_graph, inputs); | ||||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | ||||
| // Initialize parameter server | // Initialize parameter server | ||||
| if (!ps_init_) { | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| } | |||||
| InitPSParamAndOptim(kernel_graph, inputs); | |||||
| #endif | #endif | ||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | MS_EXCEPTION_IF_NULL(kernel_graph); | ||||
| // Convert inputs to model | // Convert inputs to model | ||||
| @@ -281,7 +279,10 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph | |||||
| RunOpAllocateMemory(input_tensors, kernel_graph.get()); | RunOpAllocateMemory(input_tensors, kernel_graph.get()); | ||||
| // Execute the computation | // Execute the computation | ||||
| LoadInputData(kernel_graph, input_tensors); | LoadInputData(kernel_graph, input_tensors); | ||||
| Execute(kernel_graph); | |||||
| { | |||||
| py::gil_scoped_release gil_release; | |||||
| Execute(kernel_graph); | |||||
| } | |||||
| // Fetch outputs | // Fetch outputs | ||||
| VectorRef outputs; | VectorRef outputs; | ||||
| UpdateOutputs(kernel_graph, &outputs, input_tensors); | UpdateOutputs(kernel_graph, &outputs, input_tensors); | ||||
| @@ -1021,26 +1021,16 @@ AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_nod | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const { | |||||
| if (internal_outputs_to_front_map_.find(node) != internal_outputs_to_front_map_.end()) { | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) { | |||||
| if (node == nullptr) { | |||||
| return; | |||||
| } | |||||
| (void)final_output_kernels_.insert(node); | |||||
| } | |||||
| bool KernelGraph::IsFinalOutputKernel(const AnfNodePtr &node) const { | |||||
| if (node == nullptr) { | |||||
| return false; | |||||
| } | |||||
| if (final_output_kernels_.find(node) != final_output_kernels_.end()) { | |||||
| return true; | |||||
| bool KernelGraph::IsInternalOutput(const AnfNodePtr &node, int output_idx) const { | |||||
| auto front_nodes_iter = internal_outputs_to_front_map_.find(node); | |||||
| if (front_nodes_iter != internal_outputs_to_front_map_.end()) { | |||||
| if (output_idx == -1) { | |||||
| return true; | |||||
| } | |||||
| auto &front_nodes = front_nodes_iter->second; | |||||
| if (front_nodes.find(output_idx) != front_nodes.end()) { | |||||
| return true; | |||||
| } | |||||
| } | } | ||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -153,9 +153,7 @@ class KernelGraph : public FuncGraph { | |||||
| void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1, | void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1, | ||||
| int dst_output_idx = -1); | int dst_output_idx = -1); | ||||
| AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; | AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; | ||||
| bool IsInternalOutput(const AnfNodePtr &node) const; | |||||
| void AddFinalOutputKernel(const AnfNodePtr &node); | |||||
| bool IsFinalOutputKernel(const AnfNodePtr &node) const; | |||||
| bool IsInternalOutput(const AnfNodePtr &node, int output_idx = -1) const; | |||||
| uint32_t current_epoch() const { return current_epoch_; } | uint32_t current_epoch() const { return current_epoch_; } | ||||
| void set_current_epoch(uint32_t epoch) { current_epoch_ = epoch; } | void set_current_epoch(uint32_t epoch) { current_epoch_ = epoch; } | ||||
| void UpdateChildGraphOrder(); | void UpdateChildGraphOrder(); | ||||
| @@ -230,7 +228,6 @@ class KernelGraph : public FuncGraph { | |||||
| bool null_output_; | bool null_output_; | ||||
| std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_; | std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_; | ||||
| std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_; | std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_; | ||||
| std::set<AnfNodePtr> final_output_kernels_; | |||||
| uint32_t current_epoch_; | uint32_t current_epoch_; | ||||
| }; | }; | ||||
| } // namespace session | } // namespace session | ||||
| @@ -89,7 +89,7 @@ BaseRef CreateOneTensor(const AnfNodePtr &node, size_t output_index, const Kerne | |||||
| TypeId type_id = kNumberTypeFloat32; | TypeId type_id = kNumberTypeFloat32; | ||||
| type_id = AnfAlgo::GetOutputInferDataType(node, output_index); | type_id = AnfAlgo::GetOutputInferDataType(node, output_index); | ||||
| std::vector<int> temp_shape; | std::vector<int> temp_shape; | ||||
| if (graph.IsInternalOutput(node)) { | |||||
| if (graph.IsInternalOutput(node, output_index)) { | |||||
| temp_shape.emplace_back(1); | temp_shape.emplace_back(1); | ||||
| tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape); | tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape); | ||||
| tensor->set_device_address(address); | tensor->set_device_address(address); | ||||
| @@ -307,18 +307,17 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const | |||||
| auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx); | auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx); | ||||
| auto ref_real_node = real_kernel.first; | auto ref_real_node = real_kernel.first; | ||||
| auto ref_real_node_index = real_kernel.second; | auto ref_real_node_index = real_kernel.second; | ||||
| if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) && | |||||
| node_graph->IsFinalOutputKernel(ref_real_node)) { | |||||
| if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node, ref_real_node_index)) { | |||||
| auto kernel_info = ref_real_node->kernel_info(); | auto kernel_info = ref_real_node->kernel_info(); | ||||
| if (kernel_info == nullptr || !kernel_info->has_build_info()) { | if (kernel_info == nullptr || !kernel_info->has_build_info()) { | ||||
| MS_LOG(INFO) << "No kernel info"; | MS_LOG(INFO) << "No kernel info"; | ||||
| return; | return; | ||||
| } | } | ||||
| auto address = AnfAlgo::GetMutableOutputAddr(ref_real_node, ref_real_node_index); | |||||
| if (address == nullptr) { | |||||
| if (!opt::IsNopNode(ref_real_node) && !AnfAlgo::OutputAddrExist(ref_real_node, ref_real_node_index)) { | |||||
| MS_LOG(INFO) << "No kernel address"; | MS_LOG(INFO) << "No kernel address"; | ||||
| return; | return; | ||||
| } | } | ||||
| auto address = AnfAlgo::GetMutableOutputAddr(ref_real_node, ref_real_node_index); | |||||
| auto format = AnfAlgo::GetOutputFormat(ref_real_node, ref_real_node_index); | auto format = AnfAlgo::GetOutputFormat(ref_real_node, ref_real_node_index); | ||||
| auto type = AnfAlgo::GetOutputDeviceDataType(ref_real_node, ref_real_node_index); | auto type = AnfAlgo::GetOutputDeviceDataType(ref_real_node, ref_real_node_index); | ||||
| auto d_kernel_info = std::make_shared<device::KernelInfo>(); | auto d_kernel_info = std::make_shared<device::KernelInfo>(); | ||||
| @@ -1004,6 +1003,7 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std: | |||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| if (internal_output) { | if (internal_output) { | ||||
| MS_LOG(INFO) << "Internal output1: " << out->DebugString() << "To " << backend_real_kernel.first->DebugString(); | MS_LOG(INFO) << "Internal output1: " << out->DebugString() << "To " << backend_real_kernel.first->DebugString(); | ||||
| graph->AddInternalOutput(out, backend_real_kernel.first); | graph->AddInternalOutput(out, backend_real_kernel.first); | ||||
| @@ -1203,11 +1203,9 @@ void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, | |||||
| MS_EXCEPTION_IF_NULL(input_node); | MS_EXCEPTION_IF_NULL(input_node); | ||||
| if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) { | if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) { | ||||
| auto pk_node = input_node->cast<ParameterPtr>(); | auto pk_node = input_node->cast<ParameterPtr>(); | ||||
| mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim( | |||||
| pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes())); | |||||
| mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(pk_node->fullname_with_scope(), tensor); | |||||
| } | } | ||||
| } | } | ||||
| ps_init_ = true; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } // namespace session | } // namespace session | ||||
| @@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>; | |||||
| class SessionBasic { | class SessionBasic { | ||||
| public: | public: | ||||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) { | |||||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) { | |||||
| #ifdef ENABLE_DEBUGGER | #ifdef ENABLE_DEBUGGER | ||||
| debugger_ = nullptr; | debugger_ = nullptr; | ||||
| #endif | #endif | ||||
| @@ -152,7 +152,6 @@ class SessionBasic { | |||||
| CallBackFunc summary_callback_; | CallBackFunc summary_callback_; | ||||
| static GraphId graph_sum_; | static GraphId graph_sum_; | ||||
| uint32_t device_id_; | uint32_t device_id_; | ||||
| bool ps_init_; | |||||
| #ifdef ENABLE_DEBUGGER | #ifdef ENABLE_DEBUGGER | ||||
| std::shared_ptr<Debugger> debugger_; | std::shared_ptr<Debugger> debugger_; | ||||
| #endif | #endif | ||||
| @@ -378,10 +378,19 @@ AbstractBasePtr InferImplMakeIndexedSlices(const AnalysisEnginePtr &, const Prim | |||||
| auto elem = GetValue<int>(e); | auto elem = GetValue<int>(e); | ||||
| return elem; | return elem; | ||||
| }); | }); | ||||
| for (auto dense_shape_elem : dense_shape_vec) { | |||||
| if (dense_shape_elem < 0) { | |||||
| MS_EXCEPTION(TypeError) << "The element of dense_shape must be positive, but got " | |||||
| << dense_shape_value->ToString(); | |||||
| if (dense_shape_vec.size() != values_shp.size()) { | |||||
| MS_EXCEPTION(TypeError) << "The size of dense_shape must be the same with the dimension of values " | |||||
| << values_shp.size() << ", but got " << dense_shape_value->size(); | |||||
| } | |||||
| for (size_t i = 0; i < dense_shape_vec.size(); i++) { | |||||
| if (dense_shape_vec[i] < 0) { | |||||
| MS_EXCEPTION(TypeError) << "The " << i << "th element of dense_shape must be positive, but got " | |||||
| << dense_shape_vec[i]; | |||||
| } | |||||
| // The 0th mode might be less or exceed dense_shape[0] due to duplicated selection | |||||
| if (i != 0 && dense_shape_vec[i] != values_shp[i]) { | |||||
| MS_EXCEPTION(TypeError) << "The " << i << "th element of dense_shape must be same with the " << i | |||||
| << "th dimension of values " << values_shp[i] << ", but got " << dense_shape_vec[i]; | |||||
| } | } | ||||
| } | } | ||||
| auto ret = std::make_shared<AbstractIndexedSlices>(values->element()->BuildType(), dense_shape_vec); | auto ret = std::make_shared<AbstractIndexedSlices>(values->element()->BuildType(), dense_shape_vec); | ||||
| @@ -34,7 +34,8 @@ namespace parallel { | |||||
| #define OPERATOR_TO_OPERATOR_CONNECTOR "-" | #define OPERATOR_TO_OPERATOR_CONNECTOR "-" | ||||
| #define DEFAULT_DEVICE_MEMORY_CAPACITY (1024.0 * 1024.0 * 1024.0 * 16.0) | #define DEFAULT_DEVICE_MEMORY_CAPACITY (1024.0 * 1024.0 * 1024.0 * 16.0) | ||||
| #define DEFAULT_COST_MODEL_ALPHA 1.0 | #define DEFAULT_COST_MODEL_ALPHA 1.0 | ||||
| #define DEFAULT_COST_MODEL_BETA 400.0 | |||||
| #define DEFAULT_COST_MODEL_BETA_ASCEND 400.0 // for 'device_target = Ascend' | |||||
| #define DEFAULT_COST_MODEL_BETA_GPU 50.0 // for 'device_target = GPU' | |||||
| #define DEFAULT_COST_MODEL_GAMMA 0.001 | #define DEFAULT_COST_MODEL_GAMMA 0.001 | ||||
| #define DEFAULT_COST_MODEL_SIMPLIFY_CALCULATION true | #define DEFAULT_COST_MODEL_SIMPLIFY_CALCULATION true | ||||
| #define DEFAULT_COST_MODEL_COMMUNI_THRESHOLD 2048.0 | #define DEFAULT_COST_MODEL_COMMUNI_THRESHOLD 2048.0 | ||||
| @@ -73,7 +74,7 @@ class CostGraph { | |||||
| CostGraph() { | CostGraph() { | ||||
| dev_memory_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | dev_memory_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | ||||
| costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | ||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA; | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND; | |||||
| } | } | ||||
| ~CostGraph() = default; | ~CostGraph() = default; | ||||
| void AddOperator(const OperatorInfoPtr &op) { ops_.push_back(op); } | void AddOperator(const OperatorInfoPtr &op) { ops_.push_back(op); } | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "frontend/parallel/allreduce_fusion/allreduce_fusion.h" | #include "frontend/parallel/allreduce_fusion/allreduce_fusion.h" | ||||
| #include "frontend/parallel/auto_parallel/graph_costmodel.h" | #include "frontend/parallel/auto_parallel/graph_costmodel.h" | ||||
| #include "utils/context/ms_context.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace parallel { | namespace parallel { | ||||
| @@ -41,7 +42,7 @@ CostModelContext::CostModelContext() { | |||||
| void CostModelContext::ResetCostModel() { | void CostModelContext::ResetCostModel() { | ||||
| device_memory_capacity_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | device_memory_capacity_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | ||||
| costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | ||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA; | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND; | |||||
| costmodel_gamma_ = DEFAULT_COST_MODEL_GAMMA; | costmodel_gamma_ = DEFAULT_COST_MODEL_GAMMA; | ||||
| costmodel_communi_threshold_ = DEFAULT_COST_MODEL_COMMUNI_THRESHOLD; | costmodel_communi_threshold_ = DEFAULT_COST_MODEL_COMMUNI_THRESHOLD; | ||||
| costmodel_communi_const_ = DEFAULT_COST_MODEL_COMMUNI_CONST; | costmodel_communi_const_ = DEFAULT_COST_MODEL_COMMUNI_CONST; | ||||
| @@ -66,6 +67,12 @@ void CostModelContext::ResetAlgoParameters() { | |||||
| elementwise_stra_follow_ = DEFAULT_ELEMENTWISE_OP_STRA_FOLLOW; | elementwise_stra_follow_ = DEFAULT_ELEMENTWISE_OP_STRA_FOLLOW; | ||||
| } | } | ||||
| void CostModelContext::set_costmodel_context_for_device(const std::string &device_target) { | |||||
| if (device_target == kGPUDevice) { | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_GPU; | |||||
| } | |||||
| } | |||||
| void CostModelContext::set_device_memory_capacity(double dm_capacity) { device_memory_capacity_ = dm_capacity; } | void CostModelContext::set_device_memory_capacity(double dm_capacity) { device_memory_capacity_ = dm_capacity; } | ||||
| void CostModelContext::set_costmodel_alpha(double cm_alpha) { costmodel_alpha_ = cm_alpha; } | void CostModelContext::set_costmodel_alpha(double cm_alpha) { costmodel_alpha_ = cm_alpha; } | ||||
| @@ -35,6 +35,7 @@ class CostModelContext { | |||||
| static std::shared_ptr<CostModelContext> GetInstance(); | static std::shared_ptr<CostModelContext> GetInstance(); | ||||
| void set_costmodel_context_for_device(const std::string &); | |||||
| // DEVICE_MEMORY_CAPACITY | // DEVICE_MEMORY_CAPACITY | ||||
| void set_device_memory_capacity(double); | void set_device_memory_capacity(double); | ||||
| double device_memory_capacity() const { return device_memory_capacity_; } | double device_memory_capacity() const { return device_memory_capacity_; } | ||||
| @@ -57,15 +57,22 @@ constexpr char kMomentum[] = "momentum"; | |||||
| constexpr char kApplyMomentum[] = "ApplyMomentum"; | constexpr char kApplyMomentum[] = "ApplyMomentum"; | ||||
| constexpr char kSparseAdam[] = "Adam"; | constexpr char kSparseAdam[] = "Adam"; | ||||
| constexpr char kSparseFtrl[] = "Ftrl"; | constexpr char kSparseFtrl[] = "Ftrl"; | ||||
| constexpr char kApplyMomentumOp[] = "Momentum"; | |||||
| constexpr char kSparseAdamOp[] = "Adam"; | |||||
| constexpr char kSparseFtrlOp[] = "FTRL"; | |||||
| constexpr int kInitWeightsCmd = 10; | constexpr int kInitWeightsCmd = 10; | ||||
| constexpr int kInitWeightToOptimIdCmd = 11; | constexpr int kInitWeightToOptimIdCmd = 11; | ||||
| constexpr int kInitOptimInputsShapeCmd = 12; | constexpr int kInitOptimInputsShapeCmd = 12; | ||||
| constexpr int kInitKeyToPushNodeIdCmd = 13; | |||||
| constexpr int kInitEmbeddingsCmd = 20; | constexpr int kInitEmbeddingsCmd = 20; | ||||
| constexpr int kCheckReadyForPushCmd = 25; | |||||
| constexpr int kCheckReadyForPullCmd = 26; | |||||
| constexpr int kEmbeddingLookupCmd = 30; | constexpr int kEmbeddingLookupCmd = 30; | ||||
| constexpr int kFinalizeCmd = 40; | constexpr int kFinalizeCmd = 40; | ||||
| constexpr size_t kInvalidKey = UINT64_MAX; | constexpr size_t kInvalidKey = UINT64_MAX; | ||||
| constexpr int kInvalidID = -1; | |||||
| using Key = ::ps::Key; | using Key = ::ps::Key; | ||||
| using Keys = ::ps::SArray<Key>; | using Keys = ::ps::SArray<Key>; | ||||
| @@ -158,16 +158,19 @@ OptimizerInfo *SparseFtrlOptimInfoBuilder::BuildInputs(const WeightPtr &weight, | |||||
| } | } | ||||
| AddressPtr linear = std::make_shared<kernel::Address>(); | AddressPtr linear = std::make_shared<kernel::Address>(); | ||||
| linear->addr = new float[weight->size()]; | linear->addr = new float[weight->size()]; | ||||
| memcpy_s(linear->addr, weight->size() * sizeof(float), 0x00, weight->size() * sizeof(float)); | |||||
| auto ret = memset_s(linear->addr, weight->size() * sizeof(float), 0x00, weight->size() * sizeof(float)); | |||||
| if (ret != 0) { | |||||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||||
| } | |||||
| linear->size = weight->size() * sizeof(float); | linear->size = weight->size() * sizeof(float); | ||||
| const std::shared_ptr<std::vector<size_t>> &grad_shape = (*inputs_shape)[3]; | const std::shared_ptr<std::vector<size_t>> &grad_shape = (*inputs_shape)[3]; | ||||
| size_t total_grad_size = std::accumulate((*grad_shape).begin(), (*grad_shape).end(), 1, std::multiplies<size_t>()); | size_t total_grad_size = std::accumulate((*grad_shape).begin(), (*grad_shape).end(), 1, std::multiplies<size_t>()); | ||||
| AddressPtr grad = std::make_shared<kernel::Address>(); | AddressPtr grad = std::make_shared<kernel::Address>(); | ||||
| grad->addr = new float[total_grad_size * worker_num]; | grad->addr = new float[total_grad_size * worker_num]; | ||||
| auto ret = memcpy_s(grad->addr, lens[0] * sizeof(float), values.data(), lens[0] * sizeof(float)); | |||||
| if (ret != 0) { | |||||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||||
| auto ret1 = memcpy_s(grad->addr, lens[0] * sizeof(float), values.data(), lens[0] * sizeof(float)); | |||||
| if (ret1 != 0) { | |||||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret1 << ")"; | |||||
| } | } | ||||
| grad->size = lens[0] * sizeof(float); | grad->size = lens[0] * sizeof(float); | ||||
| @@ -28,6 +28,7 @@ | |||||
| #include <thread> | #include <thread> | ||||
| #include <cmath> | #include <cmath> | ||||
| #include <random> | #include <random> | ||||
| #include <list> | |||||
| #include "ir/func_graph.h" | #include "ir/func_graph.h" | ||||
| #include "backend/session/session_basic.h" | #include "backend/session/session_basic.h" | ||||
| #include "backend/session/anf_runtime_algorithm.h" | #include "backend/session/anf_runtime_algorithm.h" | ||||
| @@ -70,6 +71,7 @@ class ParameterServer { | |||||
| handler_(nullptr), | handler_(nullptr), | ||||
| func_graph_(nullptr), | func_graph_(nullptr), | ||||
| sess_(nullptr), | sess_(nullptr), | ||||
| running_(true), | |||||
| thread_(nullptr) {} | thread_(nullptr) {} | ||||
| ~ParameterServer() = default; | ~ParameterServer() = default; | ||||
| ParameterServer(const ParameterServer &) = delete; | ParameterServer(const ParameterServer &) = delete; | ||||
| @@ -89,6 +91,8 @@ class ParameterServer { | |||||
| ::ps::KVPairs<T> *res); | ::ps::KVPairs<T> *res); | ||||
| void HandleInitInputsShape(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | void HandleInitInputsShape(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | ||||
| void HandleInitEmbeddings(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | void HandleInitEmbeddings(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | ||||
| void HandleCheckReadyForPush(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | |||||
| void HandleCheckReadyForPull(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | |||||
| void HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | void HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | ||||
| void HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | void HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res); | ||||
| @@ -96,6 +100,9 @@ class ParameterServer { | |||||
| typedef void (ServerHandler::*RequestHandler)(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, | typedef void (ServerHandler::*RequestHandler)(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, | ||||
| ::ps::KVPairs<T> *res); | ::ps::KVPairs<T> *res); | ||||
| std::unordered_map<int, RequestHandler> handlers_; | std::unordered_map<int, RequestHandler> handlers_; | ||||
| std::unordered_map<Key, bool> init_weights_; | |||||
| std::unordered_map<Key, bool> init_weight_to_optim_; | |||||
| std::unordered_map<Key, bool> init_optim_info_; | |||||
| }; | }; | ||||
| bool Init(const FuncGraphPtr &func_graph); | bool Init(const FuncGraphPtr &func_graph); | ||||
| @@ -106,14 +113,18 @@ class ParameterServer { | |||||
| void InitGrad(const Key &key, const GradPtr &grad); | void InitGrad(const Key &key, const GradPtr &grad); | ||||
| void InitEmbeddingTable(const Key &key, | void InitEmbeddingTable(const Key &key, | ||||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes); | const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes); | ||||
| void Finalize(); | |||||
| void UpdateWeights(); | void UpdateWeights(); | ||||
| void AccumGrad(const Keys &key, const Values &values, const Lengths &lengths); | void AccumGrad(const Keys &key, const Values &values, const Lengths &lengths); | ||||
| WeightPtr weight(const Key &key); | WeightPtr weight(const Key &key); | ||||
| void DoEmbeddingLookup(Key key, const LookupIds &lookup_ids, ::ps::KVPairs<T> *res); | void DoEmbeddingLookup(Key key, const LookupIds &lookup_ids, ::ps::KVPairs<T> *res); | ||||
| int SumOfShapes(const std::vector<int> &shapes) const; | int SumOfShapes(const std::vector<int> &shapes) const; | ||||
| bool ReadyForUpdateWeights(); | bool ReadyForUpdateWeights(); | ||||
| bool ReadyForAccumGrads(); | |||||
| bool ReadyForPush(const Key &key); | |||||
| bool ReadyForPull(const Key &key); | |||||
| void ResetGradAccumCount(); | void ResetGradAccumCount(); | ||||
| std::mutex &mutex(); | |||||
| const CNodePtr GetCNode(const std::string &name) const; | |||||
| size_t pserver_num_; | size_t pserver_num_; | ||||
| size_t worker_num_; | size_t worker_num_; | ||||
| @@ -123,20 +134,23 @@ class ParameterServer { | |||||
| std::unique_ptr<ServerHandler> handler_; | std::unique_ptr<ServerHandler> handler_; | ||||
| FuncGraphPtr func_graph_; | FuncGraphPtr func_graph_; | ||||
| std::shared_ptr<session::SessionBasic> sess_; | std::shared_ptr<session::SessionBasic> sess_; | ||||
| bool running_; | |||||
| std::unordered_map<Key, std::shared_ptr<PServerKernel>> optimizers_; | std::unordered_map<Key, std::shared_ptr<PServerKernel>> optimizers_; | ||||
| std::unordered_map<Key, InputsShapePtr> optim_inputs_shape_; | std::unordered_map<Key, InputsShapePtr> optim_inputs_shape_; | ||||
| std::unordered_map<Key, std::shared_ptr<OptimizerInfo>> optim_infos_; | std::unordered_map<Key, std::shared_ptr<OptimizerInfo>> optim_infos_; | ||||
| std::unordered_map<std::string, std::shared_ptr<OptimizerInfoBuilder>> optim_info_builders_; | std::unordered_map<std::string, std::shared_ptr<OptimizerInfoBuilder>> optim_info_builders_; | ||||
| std::unordered_map<Key, std::string> weight_key_to_optims_; | std::unordered_map<Key, std::string> weight_key_to_optims_; | ||||
| std::unordered_map<Key, std::string> weight_key_to_optim_op_; | |||||
| std::unordered_map<Key, WeightPtr> weights_; | std::unordered_map<Key, WeightPtr> weights_; | ||||
| std::unordered_map<Key, bool> is_embedding_; | |||||
| std::unordered_map<Key, WeightPtr> grads_; | std::unordered_map<Key, WeightPtr> grads_; | ||||
| std::unordered_map<Key, size_t> grads_accum_counter_; | std::unordered_map<Key, size_t> grads_accum_counter_; | ||||
| std::unordered_map<Key, std::shared_ptr<PServerKernel>> embedding_lookup_ops_; | std::unordered_map<Key, std::shared_ptr<PServerKernel>> embedding_lookup_ops_; | ||||
| std::unordered_map<Key, uint64_t> tokens_; | |||||
| std::mutex mutex_; | std::mutex mutex_; | ||||
| std::condition_variable apply_grads_cv_; | std::condition_variable apply_grads_cv_; | ||||
| std::condition_variable accum_grads_cv_; | |||||
| std::unique_ptr<std::thread> thread_; | std::unique_ptr<std::thread> thread_; | ||||
| @@ -165,6 +179,8 @@ void ParameterServer<T>::ServerHandler::Init() { | |||||
| handlers_[kInitWeightToOptimIdCmd] = &ServerHandler::HandleInitWeightToOptimId; | handlers_[kInitWeightToOptimIdCmd] = &ServerHandler::HandleInitWeightToOptimId; | ||||
| handlers_[kInitOptimInputsShapeCmd] = &ServerHandler::HandleInitInputsShape; | handlers_[kInitOptimInputsShapeCmd] = &ServerHandler::HandleInitInputsShape; | ||||
| handlers_[kInitEmbeddingsCmd] = &ServerHandler::HandleInitEmbeddings; | handlers_[kInitEmbeddingsCmd] = &ServerHandler::HandleInitEmbeddings; | ||||
| handlers_[kCheckReadyForPushCmd] = &ServerHandler::HandleCheckReadyForPush; | |||||
| handlers_[kCheckReadyForPullCmd] = &ServerHandler::HandleCheckReadyForPull; | |||||
| handlers_[kEmbeddingLookupCmd] = &ServerHandler::HandleEmbeddingLookup; | handlers_[kEmbeddingLookupCmd] = &ServerHandler::HandleEmbeddingLookup; | ||||
| handlers_[kFinalizeCmd] = &ServerHandler::HandleFinalize; | handlers_[kFinalizeCmd] = &ServerHandler::HandleFinalize; | ||||
| } | } | ||||
| @@ -186,6 +202,7 @@ void ParameterServer<T>::ServerHandler::HandlePullReq(const ::ps::KVMeta &req_me | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::ServerHandler::HandleInitWeights(const ::ps::KVMeta &req_meta, | void ParameterServer<T>::ServerHandler::HandleInitWeights(const ::ps::KVMeta &req_meta, | ||||
| const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | ||||
| std::unique_lock<std::mutex> lock(ps_->mutex()); | |||||
| size_t key_num = req_data.keys.size(); | size_t key_num = req_data.keys.size(); | ||||
| T *data_ptr = req_data.vals.data(); | T *data_ptr = req_data.vals.data(); | ||||
| size_t pos = 0; | size_t pos = 0; | ||||
| @@ -207,10 +224,16 @@ template <typename T> | |||||
| void ParameterServer<T>::ServerHandler::HandleInitWeightToOptimId(const ::ps::KVMeta &req_meta, | void ParameterServer<T>::ServerHandler::HandleInitWeightToOptimId(const ::ps::KVMeta &req_meta, | ||||
| const ::ps::KVPairs<T> &req_data, | const ::ps::KVPairs<T> &req_data, | ||||
| ::ps::KVPairs<T> *res) { | ::ps::KVPairs<T> *res) { | ||||
| std::unique_lock<std::mutex> lock(ps_->mutex()); | |||||
| size_t key_num = req_data.keys.size(); | size_t key_num = req_data.keys.size(); | ||||
| for (size_t i = 0; i < key_num; i++) { | for (size_t i = 0; i < key_num; i++) { | ||||
| Key key = req_data.keys[i]; | Key key = req_data.keys[i]; | ||||
| T val = req_data.vals[i]; | T val = req_data.vals[i]; | ||||
| if (init_weight_to_optim_[key]) { | |||||
| continue; | |||||
| } else { | |||||
| init_weight_to_optim_[key] = true; | |||||
| } | |||||
| ps_->InitWeightKeyToOptims(key, val); | ps_->InitWeightKeyToOptims(key, val); | ||||
| } | } | ||||
| } | } | ||||
| @@ -218,12 +241,21 @@ void ParameterServer<T>::ServerHandler::HandleInitWeightToOptimId(const ::ps::KV | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::ServerHandler::HandleInitInputsShape(const ::ps::KVMeta &req_meta, | void ParameterServer<T>::ServerHandler::HandleInitInputsShape(const ::ps::KVMeta &req_meta, | ||||
| const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | ||||
| std::unique_lock<std::mutex> lock(ps_->mutex()); | |||||
| const Key &key = req_data.keys[0]; | |||||
| if (init_optim_info_[key]) { | |||||
| return; | |||||
| } else { | |||||
| init_optim_info_[key] = true; | |||||
| } | |||||
| ps_->InitOptimInputsShape(req_data.keys, req_data.vals, req_data.lens); | ps_->InitOptimInputsShape(req_data.keys, req_data.vals, req_data.lens); | ||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta &req_meta, | void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta &req_meta, | ||||
| const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | ||||
| std::unique_lock<std::mutex> lock(ps_->mutex()); | |||||
| const Key &key = req_data.keys[0]; | |||||
| std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> shapes = | std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> shapes = | ||||
| std::make_shared<std::vector<std::shared_ptr<std::vector<size_t>>>>(); | std::make_shared<std::vector<std::shared_ptr<std::vector<size_t>>>>(); | ||||
| std::shared_ptr<std::vector<size_t>> input_shape = std::make_shared<std::vector<size_t>>(); | std::shared_ptr<std::vector<size_t>> input_shape = std::make_shared<std::vector<size_t>>(); | ||||
| @@ -233,7 +265,6 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta | |||||
| shapes->push_back(indices_shape); | shapes->push_back(indices_shape); | ||||
| shapes->push_back(output_shape); | shapes->push_back(output_shape); | ||||
| const Key &key = req_data.keys[0]; | |||||
| const Lengths &lens = req_data.lens; | const Lengths &lens = req_data.lens; | ||||
| size_t index = 0; | size_t index = 0; | ||||
| for (int i = 0; i < lens[0]; i++) { | for (int i = 0; i < lens[0]; i++) { | ||||
| @@ -248,6 +279,26 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta | |||||
| ps_->InitEmbeddingTable(key, shapes); | ps_->InitEmbeddingTable(key, shapes); | ||||
| } | } | ||||
| template <typename T> | |||||
| void ParameterServer<T>::ServerHandler::HandleCheckReadyForPush(const ::ps::KVMeta &req_meta, | |||||
| const ::ps::KVPairs<T> &req_data, | |||||
| ::ps::KVPairs<T> *res) { | |||||
| const Key &key = req_data.keys[0]; | |||||
| bool ready = ps_->ReadyForPush(key); | |||||
| res->keys.push_back(key); | |||||
| res->vals.push_back(ready); | |||||
| } | |||||
| template <typename T> | |||||
| void ParameterServer<T>::ServerHandler::HandleCheckReadyForPull(const ::ps::KVMeta &req_meta, | |||||
| const ::ps::KVPairs<T> &req_data, | |||||
| ::ps::KVPairs<T> *res) { | |||||
| const Key &key = req_data.keys[0]; | |||||
| bool ready = ps_->ReadyForPull(key); | |||||
| res->keys.push_back(key); | |||||
| res->vals.push_back(ready); | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, | void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, | ||||
| const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) { | ||||
| @@ -261,7 +312,7 @@ void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::ServerHandler::HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, | void ParameterServer<T>::ServerHandler::HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, | ||||
| ::ps::KVPairs<T> *res) { | ::ps::KVPairs<T> *res) { | ||||
| ::ps::Finalize(0, false); | |||||
| ps_->Finalize(); | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -274,7 +325,6 @@ bool ParameterServer<T>::Init(const FuncGraphPtr &func_graph) { | |||||
| handler_->Init(); | handler_->Init(); | ||||
| InitOptimInfoBuilders(); | InitOptimInfoBuilders(); | ||||
| ps_->set_request_handle(*handler_); | ps_->set_request_handle(*handler_); | ||||
| thread_.reset(new std::thread(&ParameterServer::UpdateWeights, this)); | thread_.reset(new std::thread(&ParameterServer::UpdateWeights, this)); | ||||
| return true; | return true; | ||||
| @@ -296,6 +346,7 @@ void ParameterServer<T>::InitWeightKeyToOptims(const Key &key, const int &optim_ | |||||
| return; | return; | ||||
| } | } | ||||
| weight_key_to_optims_[key] = Util::optimizer_name(optim_id); | weight_key_to_optims_[key] = Util::optimizer_name(optim_id); | ||||
| weight_key_to_optim_op_[key] = Util::optimizer_node_name(optim_id); | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -318,31 +369,49 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va | |||||
| } | } | ||||
| if (weight_key_to_optims_.count(key) > 0) { | if (weight_key_to_optims_.count(key) > 0) { | ||||
| const std::string &optim_name = weight_key_to_optims_[key]; | const std::string &optim_name = weight_key_to_optims_[key]; | ||||
| const std::string &optim_op_name = weight_key_to_optim_op_[key]; | |||||
| if (optimizers_.count(key) == 0 && optim_inputs_shape_.count(key) > 0) { | if (optimizers_.count(key) == 0 && optim_inputs_shape_.count(key) > 0) { | ||||
| const CNodePtr cnode = GetCNode(optim_op_name); | |||||
| MS_EXCEPTION_IF_NULL(cnode); | |||||
| if (optim_name == kSparseAdam) { | if (optim_name == kSparseAdam) { | ||||
| std::shared_ptr<PServerKernel> optimizer = | std::shared_ptr<PServerKernel> optimizer = | ||||
| std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_); | std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_); | ||||
| optimizer->InitKernel(optim_inputs_shape_[key]); | |||||
| optimizer->InitKernel(cnode, optim_inputs_shape_[key]); | |||||
| optimizers_[key] = optimizer; | optimizers_[key] = optimizer; | ||||
| } else if (optim_name == kApplyMomentum) { | } else if (optim_name == kApplyMomentum) { | ||||
| std::shared_ptr<PServerKernel> optimizer = | std::shared_ptr<PServerKernel> optimizer = | ||||
| std::make_shared<kernel::ps::ApplyMomentumPSKernel>(rank_id_, pserver_num_); | std::make_shared<kernel::ps::ApplyMomentumPSKernel>(rank_id_, pserver_num_); | ||||
| optimizer->InitKernel(optim_inputs_shape_[key]); | |||||
| optimizer->InitKernel(cnode, optim_inputs_shape_[key]); | |||||
| optimizers_[key] = optimizer; | optimizers_[key] = optimizer; | ||||
| } else if (optim_name == kSparseFtrl) { | } else if (optim_name == kSparseFtrl) { | ||||
| std::shared_ptr<PServerKernel> optimizer = | std::shared_ptr<PServerKernel> optimizer = | ||||
| std::make_shared<kernel::ps::SparseApplyFtrlPSKernel>(rank_id_, pserver_num_); | std::make_shared<kernel::ps::SparseApplyFtrlPSKernel>(rank_id_, pserver_num_); | ||||
| optimizer->InitKernel(optim_inputs_shape_[key]); | |||||
| optimizer->InitKernel(cnode, optim_inputs_shape_[key]); | |||||
| optimizers_[key] = optimizer; | optimizers_[key] = optimizer; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| template <typename T> | |||||
| const CNodePtr ParameterServer<T>::GetCNode(const std::string &name) const { | |||||
| std::list<CNodePtr> cnodes = func_graph_->GetOrderedCnodes(); | |||||
| for (CNodePtr cnode : cnodes) { | |||||
| std::string fullname = cnode->fullname_with_scope(); | |||||
| if (fullname.find(name) != std::string::npos && fullname.find("Push") != std::string::npos) { | |||||
| return cnode; | |||||
| } | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::InitWeight(const Key &key, const WeightPtr &weight) { | void ParameterServer<T>::InitWeight(const Key &key, const WeightPtr &weight) { | ||||
| if (weights_.count(key) == 0) { | |||||
| MS_LOG(INFO) << "Initializing weight for key " << key; | |||||
| if ((weights_.count(key) == 0) || (is_embedding_[key] && weights_.count(key) != 0)) { | |||||
| weights_[key] = weight; | weights_[key] = weight; | ||||
| tokens_[key] = 0; | |||||
| is_embedding_[key] = false; | |||||
| } | } | ||||
| } | } | ||||
| @@ -357,7 +426,7 @@ void ParameterServer<T>::InitGrad(const Key &key, const GradPtr &grad) { | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::InitEmbeddingTable( | void ParameterServer<T>::InitEmbeddingTable( | ||||
| const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | ||||
| // Init embedding lookup kernel | |||||
| MS_LOG(INFO) << "Initializing embedding table for key " << key; | |||||
| std::shared_ptr<PServerKernel> lookup = std::make_shared<kernel::ps::EmbeddingLookUpPSKernel>(rank_id_, pserver_num_); | std::shared_ptr<PServerKernel> lookup = std::make_shared<kernel::ps::EmbeddingLookUpPSKernel>(rank_id_, pserver_num_); | ||||
| lookup->InitKernel(shapes); | lookup->InitKernel(shapes); | ||||
| embedding_lookup_ops_[key] = lookup; | embedding_lookup_ops_[key] = lookup; | ||||
| @@ -377,15 +446,26 @@ void ParameterServer<T>::InitEmbeddingTable( | |||||
| embedding_data[i] = random(engine); | embedding_data[i] = random(engine); | ||||
| } | } | ||||
| weights_[key] = embedding; | weights_[key] = embedding; | ||||
| tokens_[key] = 0; | |||||
| is_embedding_[key] = true; | |||||
| grads_accum_counter_[key] = 0; | grads_accum_counter_[key] = 0; | ||||
| } | } | ||||
| template <typename T> | |||||
| void ParameterServer<T>::Finalize() { | |||||
| running_ = false; | |||||
| apply_grads_cv_.notify_one(); | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::UpdateWeights() { | void ParameterServer<T>::UpdateWeights() { | ||||
| while (true) { | while (true) { | ||||
| std::unique_lock<std::mutex> lock(mutex_); | std::unique_lock<std::mutex> lock(mutex_); | ||||
| apply_grads_cv_.wait(lock, [this] { return this->ReadyForUpdateWeights(); }); | |||||
| apply_grads_cv_.wait(lock, [this] { return this->ReadyForUpdateWeights() || !running_; }); | |||||
| if (!running_) { | |||||
| break; | |||||
| } | |||||
| for (auto iter = weights_.begin(); iter != weights_.end(); iter++) { | for (auto iter = weights_.begin(); iter != weights_.end(); iter++) { | ||||
| Key key = iter->first; | Key key = iter->first; | ||||
| @@ -408,17 +488,17 @@ void ParameterServer<T>::UpdateWeights() { | |||||
| optim_info->ComputeMean(worker_num_); | optim_info->ComputeMean(worker_num_); | ||||
| optimizer->Execute(inputs, workspaces, outputs); | optimizer->Execute(inputs, workspaces, outputs); | ||||
| optim_info->Reset(); | optim_info->Reset(); | ||||
| if (!is_embedding_[key]) { | |||||
| tokens_[key] = worker_num_; | |||||
| } | |||||
| } | } | ||||
| ResetGradAccumCount(); | ResetGradAccumCount(); | ||||
| accum_grads_cv_.notify_all(); | |||||
| } | } | ||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const Lengths &lengths) { | void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const Lengths &lengths) { | ||||
| std::unique_lock<std::mutex> lock(mutex_); | std::unique_lock<std::mutex> lock(mutex_); | ||||
| accum_grads_cv_.wait(lock, [this] { return this->ReadyForAccumGrads(); }); | |||||
| const Key &key = keys[0]; | const Key &key = keys[0]; | ||||
| std::shared_ptr<OptimizerInfo> optim_info = optim_infos_[key]; | std::shared_ptr<OptimizerInfo> optim_info = optim_infos_[key]; | ||||
| @@ -451,14 +531,13 @@ void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const | |||||
| template <typename T> | template <typename T> | ||||
| WeightPtr ParameterServer<T>::weight(const Key &key) { | WeightPtr ParameterServer<T>::weight(const Key &key) { | ||||
| std::unique_lock<std::mutex> lock(mutex_); | std::unique_lock<std::mutex> lock(mutex_); | ||||
| if (weights_.count(key) == 0) { | if (weights_.count(key) == 0) { | ||||
| MS_LOG(ERROR) << "Invalid weight key " << key; | |||||
| return nullptr; | |||||
| MS_LOG(EXCEPTION) << "Invalid weight key " << key; | |||||
| } | } | ||||
| WeightPtr weight_ptr = weights_[key]; | WeightPtr weight_ptr = weights_[key]; | ||||
| WeightPtr copy_weight_ptr = std::make_shared<::ps::SArray<T>>(weight_ptr->size(), 0); | WeightPtr copy_weight_ptr = std::make_shared<::ps::SArray<T>>(weight_ptr->size(), 0); | ||||
| copy_weight_ptr->CopyFrom(weight_ptr->data(), weight_ptr->size()); | copy_weight_ptr->CopyFrom(weight_ptr->data(), weight_ptr->size()); | ||||
| tokens_[key] -= 1; | |||||
| return copy_weight_ptr; | return copy_weight_ptr; | ||||
| } | } | ||||
| @@ -529,8 +608,22 @@ inline bool ParameterServer<T>::ReadyForUpdateWeights() { | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| inline bool ParameterServer<T>::ReadyForAccumGrads() { | |||||
| return grad_accum_count_ < weights_.size(); | |||||
| inline bool ParameterServer<T>::ReadyForPush(const Key &key) { | |||||
| std::unique_lock<std::mutex> lock(mutex_); | |||||
| if (weights_.empty()) { | |||||
| MS_LOG(EXCEPTION) << "The weights in server is empty. Many reasons could cause this: 1.The Worker didn't send " | |||||
| "kInitWeightsCmd command. 2.The Server failed to initialize weights."; | |||||
| } | |||||
| return grad_accum_count_ < weights_.size() && tokens_[key] <= 0; | |||||
| } | |||||
| template <typename T> | |||||
| inline bool ParameterServer<T>::ReadyForPull(const Key &key) { | |||||
| std::unique_lock<std::mutex> lock(mutex_); | |||||
| if (tokens_.count(key) == 0 || weights_[key] == 0) { | |||||
| MS_LOG(EXCEPTION) << "Invalid weight key " << key; | |||||
| } | |||||
| return tokens_[key] > 0; | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -541,6 +634,11 @@ inline void ParameterServer<T>::ResetGradAccumCount() { | |||||
| } | } | ||||
| } | } | ||||
| template <typename T> | |||||
| inline std::mutex &ParameterServer<T>::mutex() { | |||||
| return mutex_; | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) { | void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) { | ||||
| ::ps::Start(0); | ::ps::Start(0); | ||||
| @@ -550,6 +648,8 @@ void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) { | |||||
| } | } | ||||
| Init(func_graph); | Init(func_graph); | ||||
| thread_->join(); | thread_->join(); | ||||
| ::ps::Finalize(0, true); | |||||
| exit(1); | |||||
| } | } | ||||
| } // namespace ps | } // namespace ps | ||||
| } // namespace parallel | } // namespace parallel | ||||
| @@ -23,9 +23,8 @@ namespace parallel { | |||||
| namespace ps { | namespace ps { | ||||
| void Scheduler::Run() { | void Scheduler::Run() { | ||||
| ::ps::Start(0); | ::ps::Start(0); | ||||
| while (true) { | |||||
| sleep(1); | |||||
| } | |||||
| ::ps::Finalize(0, true); | |||||
| exit(1); | |||||
| } | } | ||||
| } // namespace ps | } // namespace ps | ||||
| } // namespace parallel | } // namespace parallel | ||||
| @@ -33,6 +33,13 @@ std::unordered_map<int, std::string> Util::id_to_optimizers{ | |||||
| {1, kSparseAdam}, | {1, kSparseAdam}, | ||||
| {2, kSparseFtrl}, | {2, kSparseFtrl}, | ||||
| }; | }; | ||||
| std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{ | |||||
| {0, kApplyMomentumOp}, | |||||
| {1, kSparseAdamOp}, | |||||
| {2, kSparseFtrlOp}, | |||||
| }; | |||||
| bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); } | bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); } | ||||
| bool Util::IsRoleOfWorker() { | bool Util::IsRoleOfWorker() { | ||||
| @@ -112,6 +119,13 @@ std::string Util::optimizer_name(int id) { | |||||
| return ""; | return ""; | ||||
| } | } | ||||
| std::string Util::optimizer_node_name(int id) { | |||||
| if (id_to_optimizer_nodes.count(id) > 0) { | |||||
| return id_to_optimizer_nodes[id]; | |||||
| } | |||||
| return ""; | |||||
| } | |||||
| bool Util::is_optimizer(std::string name) { return optimizer_to_ids.count(name) > 0; } | bool Util::is_optimizer(std::string name) { return optimizer_to_ids.count(name) > 0; } | ||||
| int Util::LocalShard(int first_dim, int rank_id, int server_num) { | int Util::LocalShard(int first_dim, int rank_id, int server_num) { | ||||
| @@ -34,12 +34,14 @@ class Util { | |||||
| static void SetInternalEnvVar(); | static void SetInternalEnvVar(); | ||||
| static int optimizer_id(std::string name); | static int optimizer_id(std::string name); | ||||
| static std::string optimizer_name(int id); | static std::string optimizer_name(int id); | ||||
| static std::string optimizer_node_name(int id); | |||||
| static bool is_optimizer(std::string name); | static bool is_optimizer(std::string name); | ||||
| static int LocalShard(int first_dim, int rank_id, int server_num); | static int LocalShard(int first_dim, int rank_id, int server_num); | ||||
| private: | private: | ||||
| static std::unordered_map<std::string, int> optimizer_to_ids; | static std::unordered_map<std::string, int> optimizer_to_ids; | ||||
| static std::unordered_map<int, std::string> id_to_optimizers; | static std::unordered_map<int, std::string> id_to_optimizers; | ||||
| static std::unordered_map<int, std::string> id_to_optimizer_nodes; | |||||
| }; | }; | ||||
| } // namespace ps | } // namespace ps | ||||
| } // namespace parallel | } // namespace parallel | ||||
| @@ -24,6 +24,7 @@ | |||||
| #include <map> | #include <map> | ||||
| #include "ps/ps.h" | #include "ps/ps.h" | ||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "ir/tensor.h" | |||||
| #include "frontend/parallel/ps/util.h" | #include "frontend/parallel/ps/util.h" | ||||
| #include "frontend/parallel/ps/common.h" | #include "frontend/parallel/ps/common.h" | ||||
| #include "frontend/parallel/ps/worker_proxy.h" | #include "frontend/parallel/ps/worker_proxy.h" | ||||
| @@ -43,18 +44,20 @@ class Worker { | |||||
| void Push(const std::vector<size_t> &keys, std::vector<uintptr_t> addrs, const std::vector<int> &sizes); | void Push(const std::vector<size_t> &keys, std::vector<uintptr_t> addrs, const std::vector<int> &sizes); | ||||
| void Pull(const size_t key, void *dev_addr, const size_t size); | void Pull(const size_t key, void *dev_addr, const size_t size); | ||||
| size_t SetParamKey(const std::string ¶m_name); | size_t SetParamKey(const std::string ¶m_name); | ||||
| void SetParamInitInServer(const std::string ¶m_name, bool init_in_server); | |||||
| bool GetParamInitInServer(const std::string ¶m_name); | |||||
| void SetKeyOptimId(size_t key, const std::string &optimizer_name); | void SetKeyOptimId(size_t key, const std::string &optimizer_name); | ||||
| void SetOptimInputShapes(size_t key, const std::vector<int> &shape); | void SetOptimInputShapes(size_t key, const std::vector<int> &shape); | ||||
| void AddEmbeddingTable(const ::ps::Key &key, const size_t &row_count); | void AddEmbeddingTable(const ::ps::Key &key, const size_t &row_count); | ||||
| void InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vector<size_t> shapes, const std::vector<int> &sizes); | void InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vector<size_t> shapes, const std::vector<int> &sizes); | ||||
| void InitPSParamAndOptim(const std::string ¶m_name, void *param_data, size_t param_size); | |||||
| void InitPSParamAndOptim(const std::string ¶m_name, tensor::TensorPtr tensor); | |||||
| void DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<int> &lookup_ids, | void DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<int> &lookup_ids, | ||||
| const ::ps::SArray<int> &lens, ::ps::SArray<T> *lookup_result, int cmd); | const ::ps::SArray<int> &lens, ::ps::SArray<T> *lookup_result, int cmd); | ||||
| void Finalize(); | void Finalize(); | ||||
| private: | private: | ||||
| Worker() : kv_worker_(nullptr), running_(false), key_cnt_(0) {} | Worker() : kv_worker_(nullptr), running_(false), key_cnt_(0) {} | ||||
| ~Worker() { ::ps::Finalize(0, true); } | |||||
| ~Worker() = default; | |||||
| Worker(const Worker &) = delete; | Worker(const Worker &) = delete; | ||||
| Worker &operator=(const Worker &) = delete; | Worker &operator=(const Worker &) = delete; | ||||
| @@ -73,6 +76,7 @@ class Worker { | |||||
| std::map<size_t, bool> init_keys_; | std::map<size_t, bool> init_keys_; | ||||
| std::map<size_t, int> key_to_optimId_; | std::map<size_t, int> key_to_optimId_; | ||||
| std::map<size_t, std::vector<std::vector<int>>> key_to_optim_shapes_; | std::map<size_t, std::vector<std::vector<int>>> key_to_optim_shapes_; | ||||
| std::map<std::string, bool> param_to_init_in_server_; | |||||
| }; | }; | ||||
| template <typename T> | template <typename T> | ||||
| @@ -81,7 +85,6 @@ void Worker<T>::Run() { | |||||
| MS_LOG(INFO) << "'Worker is already running."; | MS_LOG(INFO) << "'Worker is already running."; | ||||
| return; | return; | ||||
| } | } | ||||
| ::ps::Start(0); | ::ps::Start(0); | ||||
| if (!::ps::IsWorker()) { | if (!::ps::IsWorker()) { | ||||
| MS_LOG(EXCEPTION) << "The role is not worker."; | MS_LOG(EXCEPTION) << "The role is not worker."; | ||||
| @@ -99,18 +102,30 @@ void Worker<T>::Push(const std::vector<size_t> &keys, std::vector<uintptr_t> add | |||||
| ::ps::SArray<T> total_buffer(total_size, 0); | ::ps::SArray<T> total_buffer(total_size, 0); | ||||
| size_t offset = 0; | size_t offset = 0; | ||||
| for (size_t i = 0; i < sizes.size(); i++) { | for (size_t i = 0; i < sizes.size(); i++) { | ||||
| memcpy_s(total_buffer.data() + offset / sizeof(T), sizes[i] * sizeof(T), reinterpret_cast<void *>(addrs[i]), | |||||
| sizes[i] * sizeof(T)); | |||||
| auto ret = memcpy_s(total_buffer.data() + offset / sizeof(T), sizes[i] * sizeof(T), | |||||
| reinterpret_cast<void *>(addrs[i]), sizes[i] * sizeof(T)); | |||||
| if (ret != 0) { | |||||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||||
| } | |||||
| offset += sizes[i] * sizeof(T); | offset += sizes[i] * sizeof(T); | ||||
| } | } | ||||
| while (!kv_worker_->IsReadyForPush(keys[0])) { | |||||
| continue; | |||||
| } | |||||
| kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes)); | kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes)); | ||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| void Worker<T>::Pull(const size_t key, void *dev_addr, const size_t size) { | void Worker<T>::Pull(const size_t key, void *dev_addr, const size_t size) { | ||||
| ::ps::SArray<T> variables(size / sizeof(T), 0); | ::ps::SArray<T> variables(size / sizeof(T), 0); | ||||
| while (!kv_worker_->IsReadyForPull(key)) { | |||||
| continue; | |||||
| } | |||||
| kv_worker_->Wait(kv_worker_->ZPull({key}, &variables)); | kv_worker_->Wait(kv_worker_->ZPull({key}, &variables)); | ||||
| memcpy_s(dev_addr, size, variables.data(), size); | |||||
| auto ret = memcpy_s(dev_addr, size, variables.data(), size); | |||||
| if (ret != 0) { | |||||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||||
| } | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -121,7 +136,11 @@ void Worker<T>::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const : | |||||
| template <typename T> | template <typename T> | ||||
| void Worker<T>::Finalize() { | void Worker<T>::Finalize() { | ||||
| kv_worker_->Finalize(); | |||||
| if (running_) { | |||||
| kv_worker_->Finalize(); | |||||
| kv_worker_.reset(); | |||||
| running_ = false; | |||||
| } | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -192,6 +211,20 @@ size_t Worker<T>::SetParamKey(const std::string ¶m_name) { | |||||
| return key; | return key; | ||||
| } | } | ||||
| template <typename T> | |||||
| void Worker<T>::SetParamInitInServer(const std::string ¶m_name, bool init_in_server) { | |||||
| MS_LOG(INFO) << "Set parameter " << param_name << " init_in_server:" << init_in_server; | |||||
| param_to_init_in_server_[param_name] = init_in_server; | |||||
| } | |||||
| template <typename T> | |||||
| bool Worker<T>::GetParamInitInServer(const std::string ¶m_name) { | |||||
| if (param_to_init_in_server_.count(param_name) == 0) { | |||||
| return false; | |||||
| } | |||||
| return param_to_init_in_server_[param_name]; | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| size_t Worker<T>::GetParamKey(const std::string ¶m_name) { | size_t Worker<T>::GetParamKey(const std::string ¶m_name) { | ||||
| size_t key = kInvalidKey; | size_t key = kInvalidKey; | ||||
| @@ -237,17 +270,27 @@ void Worker<T>::InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vecto | |||||
| template <typename T> | template <typename T> | ||||
| // Initialize parameters and optimizer kernels of Parameter Server. | // Initialize parameters and optimizer kernels of Parameter Server. | ||||
| void Worker<T>::InitPSParamAndOptim(const std::string ¶m_name, void *param_data, size_t param_size) { | |||||
| void Worker<T>::InitPSParamAndOptim(const std::string ¶m_name, tensor::TensorPtr tensor) { | |||||
| void *param_data = tensor->data_c(); | |||||
| size_t param_size = LongToSize(tensor->data().nbytes()); | |||||
| std::vector<int> param_shape = tensor->shape_c(); | |||||
| size_t param_key = GetParamKey(param_name); | size_t param_key = GetParamKey(param_name); | ||||
| if (param_key == kInvalidKey) { | if (param_key == kInvalidKey) { | ||||
| MS_LOG(INFO) << "Parameter " << param_name << " has no key assigned."; | MS_LOG(INFO) << "Parameter " << param_name << " has no key assigned."; | ||||
| return; | return; | ||||
| } | } | ||||
| bool init_in_server = false; | |||||
| std::vector<int> shape_init_in_server = {1}; | |||||
| if (param_shape == shape_init_in_server) { | |||||
| init_in_server = true; | |||||
| } | |||||
| SetParamInitInServer(param_name, init_in_server); | |||||
| bool init = IsKeyInit(param_key); | bool init = IsKeyInit(param_key); | ||||
| if (!init) { | if (!init) { | ||||
| MS_LOG(INFO) << "Init paramter and optimizer in parameter server side for " << param_name; | |||||
| // No need to push embedding table data to Parameter Server. | |||||
| if (param_name.find("embedding_table") == std::string::npos && param_name.find("wide_w") == std::string::npos) { | |||||
| MS_LOG(INFO) << "Init paramter and optimizer in parameter server side for " << param_name | |||||
| << ", whether init in server: " << init_in_server; | |||||
| if (!init_in_server) { | |||||
| InitPSParamData({param_key}, param_data, param_size); | InitPSParamData({param_key}, param_data, param_size); | ||||
| } | } | ||||
| InitPSOptimId(param_key); | InitPSOptimId(param_key); | ||||
| @@ -56,6 +56,8 @@ class WorkerProxy : public ::ps::KVWorker<T> { | |||||
| int priority = 0); | int priority = 0); | ||||
| int InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, | int InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, | ||||
| const ::ps::SArray<int> &lens = {}, const Callback &cb = nullptr, int priority = 0); | const ::ps::SArray<int> &lens = {}, const Callback &cb = nullptr, int priority = 0); | ||||
| bool IsReadyForPush(const Key &key); | |||||
| bool IsReadyForPull(const Key &key); | |||||
| void PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, const ::ps::SArray<int> &lens = {}, | void PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, const ::ps::SArray<int> &lens = {}, | ||||
| int cmd = 0, int priority = 0); | int cmd = 0, int priority = 0); | ||||
| void Finalize(); | void Finalize(); | ||||
| @@ -134,6 +136,28 @@ int WorkerProxy<T>::InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, cons | |||||
| return ts; | return ts; | ||||
| } | } | ||||
| template <typename T> | |||||
| bool WorkerProxy<T>::IsReadyForPush(const Key &key) { | |||||
| ::ps::SArray<T> result(1, 0); | |||||
| this->Wait(this->ZPull({key}, &result, nullptr, kCheckReadyForPushCmd)); | |||||
| if (result[0] > 0) { | |||||
| return true; | |||||
| } else { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| template <typename T> | |||||
| bool WorkerProxy<T>::IsReadyForPull(const Key &key) { | |||||
| ::ps::SArray<T> result(1, 0); | |||||
| this->Wait(this->ZPull({key}, &result, nullptr, kCheckReadyForPullCmd)); | |||||
| if (result[0] > 0) { | |||||
| return true; | |||||
| } else { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| template <typename T> | template <typename T> | ||||
| void WorkerProxy<T>::PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, | void WorkerProxy<T>::PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray<T> &vals, | ||||
| const ::ps::SArray<int> &lens, int cmd, int priority) { | const ::ps::SArray<int> &lens, int cmd, int priority) { | ||||
| @@ -155,7 +179,7 @@ void WorkerProxy<T>::Finalize() { | |||||
| kvs.vals.push_back(0.0f); | kvs.vals.push_back(0.0f); | ||||
| Send(obj_, ts, true, false, kFinalizeCmd, kvs, broadcast_slicer_); | Send(obj_, ts, true, false, kFinalizeCmd, kvs, broadcast_slicer_); | ||||
| obj_->WaitRequest(ts); | obj_->WaitRequest(ts); | ||||
| ::ps::Finalize(0, false); | |||||
| ::ps::Finalize(0, true); | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| @@ -47,7 +47,9 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/include | |||||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ORIGIN:$ORIGIN/lib") | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ORIGIN:$ORIGIN/lib") | ||||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") | ||||
| ms_build_flatbuffers("engine/cache/de_tensor.fbs" ${CMAKE_CURRENT_SOURCE_DIR} generated_engine_files ${CMAKE_BINARY_DIR}) | |||||
| include_directories("${CMAKE_BINARY_DIR}/minddata/dataset/engine/cache") | |||||
| set(MD_FLATBUFFER_OU "${CMAKE_BINARY_DIR}/minddata/dataset/engine/cache") | |||||
| ms_build_flatbuffers("engine/cache/de_tensor.fbs" ${CMAKE_CURRENT_SOURCE_DIR} generated_engine_files ${MD_FLATBUFFER_OU}) | |||||
| ################## Include sub-modules ############################### | ################## Include sub-modules ############################### | ||||
| add_subdirectory(util) | add_subdirectory(util) | ||||
| @@ -410,6 +410,7 @@ Status DEPipeline::SaveDataset(const std::vector<std::string> &file_names, const | |||||
| std::vector<std::string> index_fields; | std::vector<std::string> index_fields; | ||||
| s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields); | s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields); | ||||
| RETURN_IF_NOT_OK(s); | RETURN_IF_NOT_OK(s); | ||||
| MS_LOG(DEBUG) << "Schema of saved mindrecord: " << mr_json.dump(); | |||||
| if (mindrecord::SUCCESS != | if (mindrecord::SUCCESS != | ||||
| mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) { | mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) { | ||||
| RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader."); | RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader."); | ||||
| @@ -569,6 +570,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string, | |||||
| if (column_name_id_map.empty()) { | if (column_name_id_map.empty()) { | ||||
| RETURN_STATUS_UNEXPECTED("Error: column not found."); | RETURN_STATUS_UNEXPECTED("Error: column not found."); | ||||
| } | } | ||||
| json dataset_schema; | |||||
| for (auto &col : column_name_id_map) { | for (auto &col : column_name_id_map) { | ||||
| auto idx = col.second; | auto idx = col.second; | ||||
| auto column_name = col.first; | auto column_name = col.first; | ||||
| @@ -580,6 +582,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string, | |||||
| auto shapes = column_shape.AsVector(); | auto shapes = column_shape.AsVector(); | ||||
| std::vector<int> mr_shape(shapes.begin(), shapes.end()); | std::vector<int> mr_shape(shapes.begin(), shapes.end()); | ||||
| std::string el = column_type.ToString(); | std::string el = column_type.ToString(); | ||||
| dataset_schema[column_name] = el; | |||||
| if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) { | if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) { | ||||
| std::string err_msg("Error: can not support data type: " + el); | std::string err_msg("Error: can not support data type: " + el); | ||||
| RETURN_STATUS_UNEXPECTED(err_msg); | RETURN_STATUS_UNEXPECTED(err_msg); | ||||
| @@ -605,6 +608,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string, | |||||
| if (mr_type == "bytes" || !mr_shape.empty()) continue; | if (mr_type == "bytes" || !mr_shape.empty()) continue; | ||||
| index_fields->emplace_back(column_name); // candidate of index fields | index_fields->emplace_back(column_name); // candidate of index fields | ||||
| } | } | ||||
| MS_LOG(DEBUG) << "Schema of dataset: " << dataset_schema.dump(); | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle, | Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle, | ||||
| @@ -268,6 +268,10 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) { | |||||
| std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); | std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); | ||||
| (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); | (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); | ||||
| int64_t byte_size = (*ptr)->SizeInBytes(); | int64_t byte_size = (*ptr)->SizeInBytes(); | ||||
| if (byte_size == 0) { | |||||
| return Status::OK(); | |||||
| } | |||||
| RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size)); | RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size)); | ||||
| unsigned char *data = static_cast<unsigned char *>(arr.request().ptr); | unsigned char *data = static_cast<unsigned char *>(arr.request().ptr); | ||||
| @@ -23,9 +23,9 @@ | |||||
| #include <utility> | #include <utility> | ||||
| #include <vector> | #include <vector> | ||||
| #include "./de_tensor_generated.h" | |||||
| #include "minddata/dataset/engine/data_buffer.h" | #include "minddata/dataset/engine/data_buffer.h" | ||||
| #include "minddata/dataset/engine/cache/cache_server.h" | #include "minddata/dataset/engine/cache/cache_server.h" | ||||
| #include "minddata/dataset/engine/cache/de_tensor_generated.h" | |||||
| #include "minddata/dataset/util/lock.h" | #include "minddata/dataset/util/lock.h" | ||||
| namespace mindspore { | namespace mindspore { | ||||
| @@ -23,8 +23,8 @@ | |||||
| #include <utility> | #include <utility> | ||||
| #include <vector> | #include <vector> | ||||
| #include "./de_tensor_generated.h" | |||||
| #include "minddata/dataset/core/tensor_row.h" | #include "minddata/dataset/core/tensor_row.h" | ||||
| #include "minddata/dataset/engine/cache/de_tensor_generated.h" | |||||
| #include "minddata/dataset/util/slice.h" | #include "minddata/dataset/util/slice.h" | ||||
| #include "minddata/dataset/util/wait_post.h" | #include "minddata/dataset/util/wait_post.h" | ||||
| @@ -25,10 +25,10 @@ | |||||
| #include <utility> | #include <utility> | ||||
| #include <vector> | #include <vector> | ||||
| #include "./de_tensor_generated.h" | |||||
| #include "minddata/dataset/core/global_context.h" | #include "minddata/dataset/core/global_context.h" | ||||
| #include "minddata/dataset/core/tensor.h" | #include "minddata/dataset/core/tensor.h" | ||||
| #include "minddata/dataset/engine/cache/cache_request.h" | #include "minddata/dataset/engine/cache/cache_request.h" | ||||
| #include "minddata/dataset/engine/cache/de_tensor_generated.h" | |||||
| #include "minddata/dataset/util/arena.h" | #include "minddata/dataset/util/arena.h" | ||||
| #include "minddata/dataset/util/btree.h" | #include "minddata/dataset/util/btree.h" | ||||
| #include "minddata/dataset/util/cache_pool.h" | #include "minddata/dataset/util/cache_pool.h" | ||||
| @@ -84,6 +84,7 @@ class CacheService : public Service { | |||||
| public: | public: | ||||
| using state_type = std::underlying_type<State>::type; | using state_type = std::underlying_type<State>::type; | ||||
| ServiceStat() : min_(0), max_(0), state_(0) {} | ServiceStat() : min_(0), max_(0), state_(0) {} | ||||
| ~ServiceStat() = default; | |||||
| CachePool::CacheStat stat_{}; | CachePool::CacheStat stat_{}; | ||||
| row_id_type min_; | row_id_type min_; | ||||
| row_id_type max_; | row_id_type max_; | ||||
| @@ -388,6 +388,13 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) { | |||||
| op->tree_->Print(ss, op); | op->tree_->Print(ss, op); | ||||
| std::string ss_str = ss.str(); | std::string ss_str = ss.str(); | ||||
| // Filter out the Num workers field when generating the check sum | |||||
| ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), ""); | |||||
| ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*\\]"), ""); | |||||
| // Filter out Number of rows when generating the check sum | |||||
| ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), ""); | |||||
| // Filter out the Operator control flags field when generating the check sum | // Filter out the Operator control flags field when generating the check sum | ||||
| ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), ""); | ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), ""); | ||||
| @@ -400,6 +407,8 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) { | |||||
| ss_str = std::regex_replace(ss_str, std::regex("Cache crc.*\n"), ""); | ss_str = std::regex_replace(ss_str, std::regex("Cache crc.*\n"), ""); | ||||
| ss_str = std::regex_replace(ss_str, std::regex("Server cache id.*\n"), ""); | ss_str = std::regex_replace(ss_str, std::regex("Server cache id.*\n"), ""); | ||||
| MS_LOG(DEBUG) << "Printing the tree for generating crc:\n" << ss_str; | |||||
| uint32_t cache_crc = system::Crc32c::GetMaskCrc32cValue(ss_str.c_str(), ss_str.length()); | uint32_t cache_crc = system::Crc32c::GetMaskCrc32cValue(ss_str.c_str(), ss_str.length()); | ||||
| return cache_crc; | return cache_crc; | ||||
| } | } | ||||
| @@ -212,12 +212,12 @@ Status DeviceQueueOp::SendDataToGPU() { | |||||
| RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle)); | RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle)); | ||||
| total_batch++; | total_batch++; | ||||
| } | } | ||||
| if (!TaskManager::FindMe()->Interrupted()) | |||||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) | |||||
| RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | ||||
| else | else | ||||
| is_break_loop = true; | is_break_loop = true; | ||||
| } | } | ||||
| if (!TaskManager::FindMe()->Interrupted()) | |||||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) | |||||
| RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | ||||
| else | else | ||||
| is_break_loop = true; | is_break_loop = true; | ||||
| @@ -758,6 +758,11 @@ Status CsvOp::ComputeColMap() { | |||||
| } else { | } else { | ||||
| MS_LOG(WARNING) << "Column name map is already set!"; | MS_LOG(WARNING) << "Column name map is already set!"; | ||||
| } | } | ||||
| if (column_default_list_.size() < column_name_id_map_.size()) { | |||||
| for (int32_t i = column_default_list_.size(); i < column_name_id_map_.size(); i++) { | |||||
| column_default_list_.push_back(std::make_shared<CsvOp::Record<std::string>>(CsvOp::STRING, "")); | |||||
| } | |||||
| } | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| } // namespace dataset | } // namespace dataset | ||||
| @@ -679,9 +679,10 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor | |||||
| } | } | ||||
| cv::Mat result; | cv::Mat result; | ||||
| cv::merge(image_result, result); | cv::merge(image_result, result); | ||||
| result.convertTo(result, input_cv->mat().type()); | |||||
| std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result); | std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result); | ||||
| if (input_cv->Rank() == 2) output_cv->Squeeze(); | |||||
| (*output) = std::static_pointer_cast<Tensor>(output_cv); | (*output) = std::static_pointer_cast<Tensor>(output_cv); | ||||
| (*output)->Reshape(input->shape()); | |||||
| } catch (const cv::Exception &e) { | } catch (const cv::Exception &e) { | ||||
| RETURN_STATUS_UNEXPECTED("Error in auto contrast"); | RETURN_STATUS_UNEXPECTED("Error in auto contrast"); | ||||
| } | } | ||||
| @@ -781,8 +782,8 @@ Status Equalize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *o | |||||
| cv::Mat result; | cv::Mat result; | ||||
| cv::merge(image_result, result); | cv::merge(image_result, result); | ||||
| std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result); | std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(result); | ||||
| if (input_cv->Rank() == 2) output_cv->Squeeze(); | |||||
| (*output) = std::static_pointer_cast<Tensor>(output_cv); | (*output) = std::static_pointer_cast<Tensor>(output_cv); | ||||
| (*output)->Reshape(input->shape()); | |||||
| } catch (const cv::Exception &e) { | } catch (const cv::Exception &e) { | ||||
| RETURN_STATUS_UNEXPECTED("Error in equalize."); | RETURN_STATUS_UNEXPECTED("Error in equalize."); | ||||
| } | } | ||||
| @@ -27,17 +27,34 @@ namespace dataset { | |||||
| SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab, | SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab, | ||||
| const SPieceTokenizerLoadType load_type, | const SPieceTokenizerLoadType load_type, | ||||
| const SPieceTokenizerOutType out_type) | const SPieceTokenizerOutType out_type) | ||||
| : vocab_(vocab), load_type_(load_type), out_type_(out_type) {} | |||||
| : vocab_(vocab), load_type_(load_type), out_type_(out_type) { | |||||
| auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto()); | |||||
| if (!status.ok()) { | |||||
| model_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "parser vocab model filed."); | |||||
| } else { | |||||
| model_status_ = Status::OK(); | |||||
| } | |||||
| } | |||||
| SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename, | SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename, | ||||
| const SPieceTokenizerLoadType load_type, | const SPieceTokenizerLoadType load_type, | ||||
| const SPieceTokenizerOutType out_type) | const SPieceTokenizerOutType out_type) | ||||
| : load_type_(load_type), out_type_(out_type) { | : load_type_(load_type), out_type_(out_type) { | ||||
| (void)GetModelRealPath(model_path, model_filename); | (void)GetModelRealPath(model_path, model_filename); | ||||
| auto status = processor_.Load(file_path_); | |||||
| if (!status.ok()) { | |||||
| model_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "load vocab model filed."); | |||||
| } else { | |||||
| model_status_ = Status::OK(); | |||||
| } | |||||
| } | } | ||||
| Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | ||||
| IO_CHECK(input, output); | IO_CHECK(input, output); | ||||
| if (!model_status_.IsOk()) { | |||||
| return model_status_; | |||||
| } | |||||
| if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { | if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { | ||||
| RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor"); | RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor"); | ||||
| } | } | ||||
| @@ -45,18 +62,6 @@ Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s | |||||
| std::string_view sentence_v; | std::string_view sentence_v; | ||||
| RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {})); | RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {})); | ||||
| std::string sentence{sentence_v}; | std::string sentence{sentence_v}; | ||||
| if (load_type_ == SPieceTokenizerLoadType::kFile) { | |||||
| auto status = processor_.Load(file_path_); | |||||
| if (!status.ok()) { | |||||
| RETURN_STATUS_UNEXPECTED("load sentence piece model failed."); | |||||
| } | |||||
| } else { | |||||
| RETURN_UNEXPECTED_IF_NULL(vocab_); | |||||
| auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto()); | |||||
| if (!status.ok()) { | |||||
| RETURN_STATUS_UNEXPECTED("sentence piece load model failed."); | |||||
| } | |||||
| } | |||||
| if (out_type_ == SPieceTokenizerOutType::kString) { | if (out_type_ == SPieceTokenizerOutType::kString) { | ||||
| std::vector<std::string> pieces; | std::vector<std::string> pieces; | ||||
| @@ -58,6 +58,7 @@ class SentencePieceTokenizerOp : public TensorOp { | |||||
| std::string file_path_; | std::string file_path_; | ||||
| SPieceTokenizerLoadType load_type_; | SPieceTokenizerLoadType load_type_; | ||||
| sentencepiece::SentencePieceProcessor processor_; | sentencepiece::SentencePieceProcessor processor_; | ||||
| Status model_status_; | |||||
| }; | }; | ||||
| } // namespace dataset | } // namespace dataset | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -296,7 +296,13 @@ Status TaskGroup::CreateAsyncTask(const std::string &my_name, const std::functio | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| void TaskGroup::interrupt_all() noexcept { intrp_svc_->InterruptAll(); } | |||||
| void TaskGroup::interrupt_all() noexcept { | |||||
| // There is a racing condition if we don't stop the interrupt service at this point. New resource | |||||
| // may come in and not being picked up after we call InterruptAll(). So stop new comers and then | |||||
| // interrupt any existing resources. | |||||
| (void)intrp_svc_->ServiceStop(); | |||||
| intrp_svc_->InterruptAll(); | |||||
| } | |||||
| Status TaskGroup::join_all(Task::WaitFlag wf) { | Status TaskGroup::join_all(Task::WaitFlag wf) { | ||||
| Status rc; | Status rc; | ||||
| @@ -312,7 +318,6 @@ Status TaskGroup::join_all(Task::WaitFlag wf) { | |||||
| } | } | ||||
| Status TaskGroup::DoServiceStop() { | Status TaskGroup::DoServiceStop() { | ||||
| intrp_svc_->ServiceStop(); | |||||
| interrupt_all(); | interrupt_all(); | ||||
| return (join_all(Task::WaitFlag::kNonBlocking)); | return (join_all(Task::WaitFlag::kNonBlocking)); | ||||
| } | } | ||||
| @@ -133,6 +133,7 @@ void BindGlobalParams(py::module *m) { | |||||
| (*m).attr("MAX_PAGE_SIZE") = kMaxPageSize; | (*m).attr("MAX_PAGE_SIZE") = kMaxPageSize; | ||||
| (*m).attr("MIN_SHARD_COUNT") = kMinShardCount; | (*m).attr("MIN_SHARD_COUNT") = kMinShardCount; | ||||
| (*m).attr("MAX_SHARD_COUNT") = kMaxShardCount; | (*m).attr("MAX_SHARD_COUNT") = kMaxShardCount; | ||||
| (*m).attr("MAX_FILE_COUNT") = kMaxFileCount; | |||||
| (*m).attr("MIN_CONSUMER_COUNT") = kMinConsumerCount; | (*m).attr("MIN_CONSUMER_COUNT") = kMinConsumerCount; | ||||
| (void)(*m).def("get_max_thread_num", &GetMaxThreadNum); | (void)(*m).def("get_max_thread_num", &GetMaxThreadNum); | ||||
| } | } | ||||
| @@ -104,7 +104,8 @@ const uint64_t kInt64Len = 8; | |||||
| const uint64_t kMinFileSize = kInt64Len; | const uint64_t kMinFileSize = kInt64Len; | ||||
| const int kMinShardCount = 1; | const int kMinShardCount = 1; | ||||
| const int kMaxShardCount = 1000; | |||||
| const int kMaxShardCount = 1000; // write | |||||
| const int kMaxFileCount = 4096; // read | |||||
| const int kMinConsumerCount = 1; | const int kMinConsumerCount = 1; | ||||
| const int kMaxConsumerCount = 128; | const int kMaxConsumerCount = 128; | ||||
| @@ -152,7 +152,7 @@ class ShardHeader { | |||||
| MSRStatus CheckIndexField(const std::string &field, const json &schema); | MSRStatus CheckIndexField(const std::string &field, const json &schema); | ||||
| void ParsePage(const json &page, int shard_index, bool load_dataset); | |||||
| MSRStatus ParsePage(const json &page, int shard_index, bool load_dataset); | |||||
| MSRStatus ParseStatistics(const json &statistics); | MSRStatus ParseStatistics(const json &statistics); | ||||
| @@ -252,7 +252,7 @@ std::vector<std::tuple<int, int, int, uint64_t>> ShardReader::ReadRowGroupSummar | |||||
| if (shard_count <= 0) { | if (shard_count <= 0) { | ||||
| return row_group_summary; | return row_group_summary; | ||||
| } | } | ||||
| if (shard_count <= kMaxShardCount) { | |||||
| if (shard_count <= kMaxFileCount) { | |||||
| for (int shard_id = 0; shard_id < shard_count; ++shard_id) { | for (int shard_id = 0; shard_id < shard_count; ++shard_id) { | ||||
| // return -1 when page's size equals to 0. | // return -1 when page's size equals to 0. | ||||
| auto last_page_id = shard_header_->GetLastPageId(shard_id); | auto last_page_id = shard_header_->GetLastPageId(shard_id); | ||||
| @@ -1054,7 +1054,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector<std::tuple<int, int, i | |||||
| } | } | ||||
| auto offsets = std::get<1>(ret); | auto offsets = std::get<1>(ret); | ||||
| auto local_columns = std::get<2>(ret); | auto local_columns = std::get<2>(ret); | ||||
| if (shard_count_ <= kMaxShardCount) { | |||||
| if (shard_count_ <= kMaxFileCount) { | |||||
| for (int shard_id = 0; shard_id < shard_count_; shard_id++) { | for (int shard_id = 0; shard_id < shard_count_; shard_id++) { | ||||
| for (uint32_t i = 0; i < offsets[shard_id].size(); i += 1) { | for (uint32_t i = 0; i < offsets[shard_id].size(); i += 1) { | ||||
| tasks_.InsertTask(TaskType::kCommonTask, offsets[shard_id][i][0], offsets[shard_id][i][1], | tasks_.InsertTask(TaskType::kCommonTask, offsets[shard_id][i][0], offsets[shard_id][i][1], | ||||
| @@ -83,7 +83,7 @@ MSRStatus ShardWriter::OpenDataFiles(bool append) { | |||||
| // if not append and mindrecord file exist, return FAILED | // if not append and mindrecord file exist, return FAILED | ||||
| fs->open(common::SafeCStr(file), std::ios::in | std::ios::binary); | fs->open(common::SafeCStr(file), std::ios::in | std::ios::binary); | ||||
| if (fs->good()) { | if (fs->good()) { | ||||
| MS_LOG(ERROR) << "MindRecord file already existed."; | |||||
| MS_LOG(ERROR) << "MindRecord file already existed, please delete file: " << common::SafeCStr(file); | |||||
| fs->close(); | fs->close(); | ||||
| return FAILED; | return FAILED; | ||||
| } | } | ||||
| @@ -55,7 +55,9 @@ MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers, bool l | |||||
| header_size_ = header["header_size"].get<uint64_t>(); | header_size_ = header["header_size"].get<uint64_t>(); | ||||
| page_size_ = header["page_size"].get<uint64_t>(); | page_size_ = header["page_size"].get<uint64_t>(); | ||||
| } | } | ||||
| ParsePage(header["page"], shard_index, load_dataset); | |||||
| if (SUCCESS != ParsePage(header["page"], shard_index, load_dataset)) { | |||||
| return FAILED; | |||||
| } | |||||
| shard_index++; | shard_index++; | ||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -248,11 +250,16 @@ MSRStatus ShardHeader::ParseIndexFields(const json &index_fields) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) { | |||||
| MSRStatus ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) { | |||||
| // set shard_index when load_dataset is false | // set shard_index when load_dataset is false | ||||
| if (pages_.empty() && shard_count_ <= kMaxShardCount) { | |||||
| if (shard_count_ > kMaxFileCount) { | |||||
| MS_LOG(ERROR) << "The number of mindrecord files is greater than max value: " << kMaxFileCount; | |||||
| return FAILED; | |||||
| } | |||||
| if (pages_.empty() && shard_count_ <= kMaxFileCount) { | |||||
| pages_.resize(shard_count_); | pages_.resize(shard_count_); | ||||
| } | } | ||||
| for (auto &page : pages) { | for (auto &page : pages) { | ||||
| int page_id = page["page_id"]; | int page_id = page["page_id"]; | ||||
| int shard_id = page["shard_id"]; | int shard_id = page["shard_id"]; | ||||
| @@ -275,6 +282,7 @@ void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_datase | |||||
| pages_[shard_index].push_back(std::move(parsed_page)); | pages_[shard_index].push_back(std::move(parsed_page)); | ||||
| } | } | ||||
| } | } | ||||
| return SUCCESS; | |||||
| } | } | ||||
| MSRStatus ShardHeader::ParseStatistics(const json &statistics) { | MSRStatus ShardHeader::ParseStatistics(const json &statistics) { | ||||
| @@ -715,7 +723,9 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) { | |||||
| std::string line; | std::string line; | ||||
| while (std::getline(page_in_handle, line)) { | while (std::getline(page_in_handle, line)) { | ||||
| ParsePage(json::parse(line), -1, true); | |||||
| if (SUCCESS != ParsePage(json::parse(line), -1, true)) { | |||||
| return FAILED; | |||||
| } | |||||
| } | } | ||||
| page_in_handle.close(); | page_in_handle.close(); | ||||
| @@ -17,6 +17,8 @@ | |||||
| */ | */ | ||||
| #include "pipeline/jit/parse/parse.h" | #include "pipeline/jit/parse/parse.h" | ||||
| #include <utility> | |||||
| #include <string> | #include <string> | ||||
| #include <memory> | #include <memory> | ||||
| #include <sstream> | #include <sstream> | ||||
| @@ -1480,21 +1482,25 @@ AnfNodePtr FindPhis(const std::unordered_map<ParameterPtr, AnfNodePtr> &removabl | |||||
| void Parser::RemoveUnnecessaryPhis() { | void Parser::RemoveUnnecessaryPhis() { | ||||
| // merge all removable phis to one map; | // merge all removable phis to one map; | ||||
| std::unordered_map<ParameterPtr, AnfNodePtr> removable_phis; | std::unordered_map<ParameterPtr, AnfNodePtr> removable_phis; | ||||
| std::vector<ParameterPtr> phis; | |||||
| for (FunctionBlockPtr &block : func_block_list_) { | for (FunctionBlockPtr &block : func_block_list_) { | ||||
| MS_EXCEPTION_IF_NULL(block); | MS_EXCEPTION_IF_NULL(block); | ||||
| removable_phis.insert(block->removable_phis().begin(), block->removable_phis().end()); | removable_phis.insert(block->removable_phis().begin(), block->removable_phis().end()); | ||||
| std::transform(block->removable_phis().begin(), block->removable_phis().end(), std::back_inserter(phis), | |||||
| [](std::pair<ParameterPtr, AnfNodePtr> pair) { return pair.first; }); | |||||
| } | } | ||||
| if (removable_phis.size() == 0) { | if (removable_phis.size() == 0) { | ||||
| return; | return; | ||||
| } | } | ||||
| auto fg_name = func_graph_->ToString(); | auto fg_name = func_graph_->ToString(); | ||||
| auto mng = Manage(func_graph_, false); | auto mng = Manage(func_graph_, false); | ||||
| // replace the nodes | // replace the nodes | ||||
| for (auto iter : removable_phis) { | |||||
| auto new_node = FindPhis(removable_phis, iter.first); | |||||
| MS_LOG(DEBUG) << "phi " << iter.first->DebugString() << " to " << new_node->DebugString(); | |||||
| mng->Replace(iter.first, new_node); | |||||
| // remove from inside to outside | |||||
| for (int idx = SizeToInt(phis.size() - 1); idx >= 0; idx--) { | |||||
| auto phi = phis[IntToSize(idx)]; | |||||
| auto new_node = FindPhis(removable_phis, phi); | |||||
| MS_LOG(DEBUG) << "phi " << phi->DebugString() << " to " << new_node->DebugString(); | |||||
| mng->Replace(phi, new_node); | |||||
| } | } | ||||
| // remove the parameter | // remove the parameter | ||||
| for (FunctionBlockPtr &block : func_block_list_) { | for (FunctionBlockPtr &block : func_block_list_) { | ||||
| @@ -45,6 +45,7 @@ | |||||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | ||||
| #include "frontend/parallel/ps/common.h" | #include "frontend/parallel/ps/common.h" | ||||
| #include "frontend/parallel/ps/util.h" | #include "frontend/parallel/ps/util.h" | ||||
| #include "frontend/parallel/ps/worker.h" | |||||
| #endif | #endif | ||||
| #if (ENABLE_GE || ENABLE_D) | #if (ENABLE_GE || ENABLE_D) | ||||
| @@ -261,6 +262,7 @@ void ExecutorPy::DelNetRes(const std::string &id) { | |||||
| for (auto &item : tmp_info) { | for (auto &item : tmp_info) { | ||||
| if (item.first.find(id) != string::npos) { | if (item.first.find(id) != string::npos) { | ||||
| MS_LOG(DEBUG) << "Delete network res:" << item.first; | MS_LOG(DEBUG) << "Delete network res:" << item.first; | ||||
| item.second = nullptr; | |||||
| (void)info_.erase(item.first); | (void)info_.erase(item.first); | ||||
| flag = true; | flag = true; | ||||
| } | } | ||||
| @@ -949,7 +951,13 @@ void ClearResAtexit() { | |||||
| pynative::ClearPyNativeSession(); | pynative::ClearPyNativeSession(); | ||||
| session::ClearPythonParasMap(); | session::ClearPythonParasMap(); | ||||
| device::KernelRuntimeManager::Instance().ClearRuntimeResource(); | device::KernelRuntimeManager::Instance().ClearRuntimeResource(); | ||||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | |||||
| if (mindspore::parallel::ps::Util::IsParamServerMode()) { | |||||
| if (parallel::ps::Util::IsRoleOfWorker()) { | |||||
| parallel::ps::Worker<float>::GetInstance().Finalize(); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| ad::g_k_prims.clear(); | ad::g_k_prims.clear(); | ||||
| abstract::ClearPrimEvaluatorMap(); | abstract::ClearPrimEvaluatorMap(); | ||||
| @@ -150,7 +150,8 @@ PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() { | |||||
| using mindspore::parse::PyObjectWrapper; | using mindspore::parse::PyObjectWrapper; | ||||
| EvalResultPtr StandardPrimEvaluator::EvalPrim(const AnalysisEnginePtr &engine, const AbstractBasePtrList &args) { | EvalResultPtr StandardPrimEvaluator::EvalPrim(const AnalysisEnginePtr &engine, const AbstractBasePtrList &args) { | ||||
| if (prim_ != prim::kPrimMakeTuple && prim_ != prim::kPrimSwitch) { | |||||
| if (prim_ != prim::kPrimMakeTuple && prim_ != prim::kPrimSwitch && prim_ != prim::kPrimEnvSetItem && | |||||
| prim_ != prim::kPrimEnvGetItem) { | |||||
| auto ret_abstract = AbstractEval(args); | auto ret_abstract = AbstractEval(args); | ||||
| if (ret_abstract != nullptr) { | if (ret_abstract != nullptr) { | ||||
| MS_LOG(DEBUG) << "StandardPrimEvaluator eval Undetermined"; | MS_LOG(DEBUG) << "StandardPrimEvaluator eval Undetermined"; | ||||
| @@ -386,6 +387,16 @@ py::dict ConvertAbstractToPython(const AbstractBasePtr &abs_base) { | |||||
| dic["shape"] = arg_tensor->shape()->shape(); | dic["shape"] = arg_tensor->shape()->shape(); | ||||
| dic["dtype"] = arg_tensor->BuildType(); | dic["dtype"] = arg_tensor->BuildType(); | ||||
| dic["value"] = BuildValue(arg_tensor->BuildValue()); | dic["value"] = BuildValue(arg_tensor->BuildValue()); | ||||
| } else if (abs_base->isa<AbstractIndexedSlices>()) { | |||||
| auto arg = dyn_cast<AbstractIndexedSlices>(abs_base); | |||||
| dic["shape"] = arg->shape()->shape(); | |||||
| dic["dtype"] = arg->BuildType(); | |||||
| dic["value"] = BuildValue(arg->BuildValue()); | |||||
| } else if (abs_base->isa<AbstractSparseTensor>()) { | |||||
| auto arg = dyn_cast<AbstractSparseTensor>(abs_base); | |||||
| dic["shape"] = arg->shape()->shape(); | |||||
| dic["dtype"] = arg->BuildType(); | |||||
| dic["value"] = BuildValue(arg->BuildValue()); | |||||
| } else if (abs_base->isa<AbstractScalar>() || abs_base->isa<AbstractType>() || abs_base->isa<AbstractRefKey>()) { | } else if (abs_base->isa<AbstractScalar>() || abs_base->isa<AbstractType>() || abs_base->isa<AbstractRefKey>()) { | ||||
| std::vector<int> shape; | std::vector<int> shape; | ||||
| dic["shape"] = shape; | dic["shape"] = shape; | ||||
| @@ -59,7 +59,7 @@ struct OpExecInfo { | |||||
| using OpExecInfoPtr = std::shared_ptr<OpExecInfo>; | using OpExecInfoPtr = std::shared_ptr<OpExecInfo>; | ||||
| OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args); | OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args); | ||||
| const std::set<std::string> ignore_infer_prim = {"make_ref"}; | |||||
| const std::set<std::string> ignore_infer_prim = {"make_ref", "mixed_precision_cast"}; | |||||
| } // namespace pynative | } // namespace pynative | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -57,7 +57,7 @@ using mindspore::tensor::TensorPy; | |||||
| const char SINGLE_OP_GRAPH[] = "single_op_graph"; | const char SINGLE_OP_GRAPH[] = "single_op_graph"; | ||||
| // primitive unable to infer value for constant input in PyNative mode | // primitive unable to infer value for constant input in PyNative mode | ||||
| const std::set<std::string> vm_operators = {"make_ref", "HookBackward", "stop_gradient"}; | |||||
| const std::set<std::string> vm_operators = {"make_ref", "HookBackward", "stop_gradient", "mixed_precision_cast"}; | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace pynative { | namespace pynative { | ||||
| @@ -690,12 +690,15 @@ py::tuple RunOpInner(const OpExecInfoPtr &op_exec_info, const py::args &args) { | |||||
| return err_ret; | return err_ret; | ||||
| } | } | ||||
| auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result); | |||||
| if (node != nullptr) { | |||||
| node->set_abstract(op_exec_info->abstract); | |||||
| MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString(); | |||||
| if (op_exec_info->op_name != prim::kPrimMixedPrecisionCast->name()) { | |||||
| auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result); | |||||
| if (node != nullptr) { | |||||
| node->set_abstract(op_exec_info->abstract); | |||||
| MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString(); | |||||
| } | |||||
| MS_LOG(DEBUG) << "RunOp end"; | |||||
| } | } | ||||
| MS_LOG(DEBUG) << "RunOp end"; | |||||
| return result; | return result; | ||||
| } | } | ||||
| @@ -766,6 +769,9 @@ PynativeExecutor::PynativeExecutor() { grad_flag_ = false; } | |||||
| void PynativeExecutor::NewGraphInner(const py::object &cell, const py::args &args) { | void PynativeExecutor::NewGraphInner(const py::object &cell, const py::args &args) { | ||||
| auto cell_id = GetId(cell); | auto cell_id = GetId(cell); | ||||
| if (cell_graph_map_.count(cell_id) != 0) { | if (cell_graph_map_.count(cell_id) != 0) { | ||||
| if (cell_resource_map_.find(cell_id) != cell_resource_map_.end()) { | |||||
| resource_ = cell_resource_map_[cell_id]; | |||||
| } | |||||
| MS_LOG(DEBUG) << "Newgraph already compiled"; | MS_LOG(DEBUG) << "Newgraph already compiled"; | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -774,6 +780,8 @@ void PynativeExecutor::NewGraphInner(const py::object &cell, const py::args &arg | |||||
| if (top_g_ == nullptr) { | if (top_g_ == nullptr) { | ||||
| top_g_ = curr_g_ = g; | top_g_ = curr_g_ = g; | ||||
| resource_ = std::make_shared<pipeline::Resource>(); | |||||
| cell_resource_map_[cell_id] = resource_; | |||||
| df_builder_ = std::make_shared<FuncGraph>(); | df_builder_ = std::make_shared<FuncGraph>(); | ||||
| MS_LOG(DEBUG) << "First new graph" << top_g_.get(); | MS_LOG(DEBUG) << "First new graph" << top_g_.get(); | ||||
| Pushp(); | Pushp(); | ||||
| @@ -910,8 +918,8 @@ void PynativeExecutor::EndGraphInner(const py::object &cell, const py::object &o | |||||
| cnode->set_inputs(args); | cnode->set_inputs(args); | ||||
| set_obj_node_map(curr_g_, out_id, cnode); | set_obj_node_map(curr_g_, out_id, cnode); | ||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << "Graph has no this out: " << out_id; | |||||
| return; | |||||
| MS_LOG(DEBUG) << "Set ValueNode as output for graph, out id: " << out_id; | |||||
| MakeValueNode(out, out_id); | |||||
| } | } | ||||
| } | } | ||||
| EndGraphByOutId(out_id, cell, out, args); | EndGraphByOutId(out_id, cell, out, args); | ||||
| @@ -1075,6 +1083,7 @@ void PynativeExecutor::Clear(const std::string &flag) { | |||||
| MS_LOG(INFO) << "Clear res"; | MS_LOG(INFO) << "Clear res"; | ||||
| (void)graph_map_.erase(flag); | (void)graph_map_.erase(flag); | ||||
| (void)cell_graph_map_.erase(flag); | (void)cell_graph_map_.erase(flag); | ||||
| (void)cell_resource_map_.erase(flag); | |||||
| Clean(); | Clean(); | ||||
| // Maybe exit in the pynative runing op, so need reset pynative flag. | // Maybe exit in the pynative runing op, so need reset pynative flag. | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| @@ -1086,6 +1095,7 @@ void PynativeExecutor::Clear(const std::string &flag) { | |||||
| MS_LOG(INFO) << "Clear"; | MS_LOG(INFO) << "Clear"; | ||||
| top_g_ = nullptr; | top_g_ = nullptr; | ||||
| df_builder_ = nullptr; | |||||
| curr_g_ = nullptr; | curr_g_ = nullptr; | ||||
| graph_info_map_.clear(); | graph_info_map_.clear(); | ||||
| std::stack<FuncGraphPtr>().swap(graph_p_); | std::stack<FuncGraphPtr>().swap(graph_p_); | ||||
| @@ -1095,7 +1105,6 @@ void PynativeExecutor::Clean() { | |||||
| MS_LOG(INFO) << "Clean all res"; | MS_LOG(INFO) << "Clean all res"; | ||||
| Clear(); | Clear(); | ||||
| grad_flag_ = false; | grad_flag_ = false; | ||||
| df_builder_ = nullptr; | |||||
| ad::CleanRes(); | ad::CleanRes(); | ||||
| pipeline::ReclaimOptimizer(); | pipeline::ReclaimOptimizer(); | ||||
| } | } | ||||
| @@ -115,6 +115,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> { | |||||
| bool grad_flag_; | bool grad_flag_; | ||||
| std::unordered_map<std::string, FuncGraphPtr> graph_map_; | std::unordered_map<std::string, FuncGraphPtr> graph_map_; | ||||
| std::unordered_map<std::string, FuncGraphPtr> cell_graph_map_; | std::unordered_map<std::string, FuncGraphPtr> cell_graph_map_; | ||||
| std::unordered_map<std::string, ResourcePtr> cell_resource_map_; | |||||
| std::unordered_map<FuncGraphPtr, GraphInfo> graph_info_map_; | std::unordered_map<FuncGraphPtr, GraphInfo> graph_info_map_; | ||||
| std::stack<FuncGraphPtr> graph_p_; | std::stack<FuncGraphPtr> graph_p_; | ||||
| FuncGraphPtr top_g_; | FuncGraphPtr top_g_; | ||||
| @@ -484,7 +484,8 @@ bool AscendDeviceAddress::SyncDeviceToHostAndConvertFormat(const std::vector<int | |||||
| std::vector<size_t> device_shape = GetDeviceShape(&host_shape); | std::vector<size_t> device_shape = GetDeviceShape(&host_shape); | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| MS_EXCEPTION_IF_NULL(ms_context); | MS_EXCEPTION_IF_NULL(ms_context); | ||||
| if (ms_context->execution_mode() == kPynativeMode && type_id_name_map.find(type_id_) != type_id_name_map.end()) { | |||||
| if (ms_context->execution_mode() != kPynativeMode && ms_context->execution_mode() != kGraphMode && | |||||
| type_id_name_map.find(type_id_) != type_id_name_map.end()) { | |||||
| std::pair<std::string, std::string> type_format = std::make_pair(type_id_name_map.at(type_id_), format_); | std::pair<std::string, std::string> type_format = std::make_pair(type_id_name_map.at(type_id_), format_); | ||||
| if (use_trans_data.find(type_format) != use_trans_data.end()) { | if (use_trans_data.find(type_format) != use_trans_data.end()) { | ||||
| sync_ok = SyncDeviceToHostAndConvertFormatBasedOnTransData(host_shape, device_shape, size, type, host_ptr); | sync_ok = SyncDeviceToHostAndConvertFormatBasedOnTransData(host_shape, device_shape, size, type, host_ptr); | ||||
| @@ -672,10 +672,8 @@ void AscendStreamAssign::InsertEventForIndependentParallel(const NotNull<KernelG | |||||
| void AscendStreamAssign::GetNeedActiveStreams(const NotNull<KernelGraphPtr> &graph_ptr) { | void AscendStreamAssign::GetNeedActiveStreams(const NotNull<KernelGraphPtr> &graph_ptr) { | ||||
| CNodePtr cur_cnode_ptr = nullptr; | CNodePtr cur_cnode_ptr = nullptr; | ||||
| auto cnode_ptr_list = graph_ptr->execution_order(); | auto cnode_ptr_list = graph_ptr->execution_order(); | ||||
| // 1)first stream 0 should be actived first; | |||||
| need_first_active_streams_.emplace_back(0); | |||||
| // 2)stream witch kStreamNeedActivedFirst attr should be actived; | |||||
| // 1)stream witch kStreamNeedActivedFirst attr should be actived; | |||||
| for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { | for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { | ||||
| cur_cnode_ptr = cnode_ptr_list[i]; | cur_cnode_ptr = cnode_ptr_list[i]; | ||||
| MS_EXCEPTION_IF_NULL(cur_cnode_ptr); | MS_EXCEPTION_IF_NULL(cur_cnode_ptr); | ||||
| @@ -691,19 +689,25 @@ void AscendStreamAssign::GetNeedActiveStreams(const NotNull<KernelGraphPtr> &gra | |||||
| } | } | ||||
| } | } | ||||
| // 3)independent stream:if has not been activate, push to need active vector | |||||
| // 2)independent stream:if has not been activate, push to need active vector | |||||
| if (!independent_stream_activated_) { | if (!independent_stream_activated_) { | ||||
| for (auto &item : independent_stream_map_) { | for (auto &item : independent_stream_map_) { | ||||
| need_first_active_streams_.emplace_back(item.first); | need_first_active_streams_.emplace_back(item.first); | ||||
| } | } | ||||
| } | } | ||||
| // 4)hcom stream:if has not been activate, push to need active vector | |||||
| // 3)hcom stream:if has not been activate, push to need active vector | |||||
| if (!hcom_stream_activated_) { | if (!hcom_stream_activated_) { | ||||
| for (auto &item : hcom_stream_map_) { | for (auto &item : hcom_stream_map_) { | ||||
| need_first_active_streams_.emplace_back(item.first); | need_first_active_streams_.emplace_back(item.first); | ||||
| } | } | ||||
| } | } | ||||
| // 4)first stream 0 should be actived first; | |||||
| auto it = std::find(need_first_active_streams_.begin(), need_first_active_streams_.end(), 0); | |||||
| if (it == need_first_active_streams_.end()) { | |||||
| need_first_active_streams_.emplace_back(0); | |||||
| } | |||||
| } | } | ||||
| // section8 | // section8 | ||||
| @@ -958,7 +962,7 @@ void AscendStreamAssign::DFS(uint32_t start, std::vector<uint32_t> *group) { | |||||
| if (!IsVecExist(group)) { | if (!IsVecExist(group)) { | ||||
| stream_groups_.emplace_back(*group); | stream_groups_.emplace_back(*group); | ||||
| } else { | } else { | ||||
| MS_LOG(WARNING) << "DFS should not print this log"; | |||||
| MS_LOG(WARNING) << "DFS find same stream group, Not expected"; | |||||
| } | } | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -492,6 +492,10 @@ void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, co | |||||
| AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { | AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown && | |||||
| AnfAlgo::OutputAddrExist(real_input_node, 0)) { | |||||
| continue; | |||||
| } | |||||
| if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { | if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { | ||||
| std::vector<std::string> output_format = {selected_kernel_info.GetInputFormat(input_index)}; | std::vector<std::string> output_format = {selected_kernel_info.GetInputFormat(input_index)}; | ||||
| builder->SetOutputsFormat(output_format); | builder->SetOutputsFormat(output_format); | ||||
| @@ -52,6 +52,11 @@ bool CPUDeviceAddress::SyncDeviceToHost(const std::vector<int> & /*shape*/, size | |||||
| bool CPUDeviceAddress::SyncHostToDevice(const std::vector<int> & /*shape*/, size_t size, TypeId type, | bool CPUDeviceAddress::SyncHostToDevice(const std::vector<int> & /*shape*/, size_t size, TypeId type, | ||||
| const void *host_ptr) const { | const void *host_ptr) const { | ||||
| if (host_ptr == ptr_) { | |||||
| MS_LOG(DEBUG) << "host_ptr is equal to ptr_, request ignored."; | |||||
| return true; | |||||
| } | |||||
| if (type == kNumberTypeFloat16) { | if (type == kNumberTypeFloat16) { | ||||
| HalfToFloat(ptr_, host_ptr, size / 2); | HalfToFloat(ptr_, host_ptr, size / 2); | ||||
| } else if (type == kNumberTypeFloat64) { | } else if (type == kNumberTypeFloat64) { | ||||
| @@ -40,8 +40,7 @@ void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) { | |||||
| AssignValueNodeAddress(kernel_graph); | AssignValueNodeAddress(kernel_graph); | ||||
| AssignInputNodeAddress(kernel_graph); | AssignInputNodeAddress(kernel_graph); | ||||
| AssignKernelOutputAddress(kernel_graph); | AssignKernelOutputAddress(kernel_graph); | ||||
| resource_manager_.MemPlan(kernel_graph); | |||||
| resource_manager_.MemMalloc(kernel_graph); | |||||
| resource_manager_.AssignMemory(kernel_graph); | |||||
| } | } | ||||
| void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) { | void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) { | ||||
| @@ -186,11 +185,15 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &k | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| return CreatTensorForOutput(node, index, bound_addresses, need_sync_outputs); | return CreatTensorForOutput(node, index, bound_addresses, need_sync_outputs); | ||||
| } else if (input_node->isa<Parameter>() || input_node->isa<ValueNode>()) { | |||||
| } else if (input_node->isa<Parameter>()) { | |||||
| auto iter = input_map.find(input_node.get()); | auto iter = input_map.find(input_node.get()); | ||||
| if (iter != input_map.end()) { | if (iter != input_map.end()) { | ||||
| return iter->second; | return iter->second; | ||||
| } | } | ||||
| } else if (input_node->isa<ValueNode>()) { | |||||
| auto value_node = input_node->cast<ValueNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(value_node); | |||||
| return value_node->value(); | |||||
| } | } | ||||
| return BaseRef(); | return BaseRef(); | ||||
| } | } | ||||
| @@ -220,7 +223,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, | |||||
| (void)tensor->data_sync(); | (void)tensor->data_sync(); | ||||
| } | } | ||||
| if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) { | |||||
| if (tensor->data_type() == address->type_id_ || tensor->data_type() == kNumberTypeFloat32 || | |||||
| tensor->data_type() == kNumberTypeInt32) { | |||||
| address->ptr_ = tensor->data_c(); | address->ptr_ = tensor->data_c(); | ||||
| } else { | } else { | ||||
| std::vector<int> data_shape = tensor->shape(); | std::vector<int> data_shape = tensor->shape(); | ||||
| @@ -34,11 +34,13 @@ void CPUResourceManager::MemFree() { | |||||
| dynamic_mem_.clear(); | dynamic_mem_.clear(); | ||||
| } | } | ||||
| void CPUResourceManager::MemPlan(const session::KernelGraph *graph) { | |||||
| mem_plan_.MemPlan(graph); | |||||
| size_t graph_mem_size = mem_plan_.GetGraphMemSize(graph); | |||||
| void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { | |||||
| size_t graph_mem_size = mem_plan_.MemPlan(graph); | |||||
| if (graph_mem_size > mem_size_) { | if (graph_mem_size > mem_size_) { | ||||
| MemFree(); | |||||
| if (mem_size_ > 0) { | |||||
| dynamic_mem_[mem_ptr_] = mem_size_; | |||||
| mem_size_ = 0; | |||||
| } | |||||
| mem_ptr_ = reinterpret_cast<uint8_t *>(malloc(graph_mem_size)); | mem_ptr_ = reinterpret_cast<uint8_t *>(malloc(graph_mem_size)); | ||||
| if (mem_ptr_ != nullptr) { | if (mem_ptr_ != nullptr) { | ||||
| mem_size_ = graph_mem_size; | mem_size_ = graph_mem_size; | ||||
| @@ -48,9 +50,6 @@ void CPUResourceManager::MemPlan(const session::KernelGraph *graph) { | |||||
| dynamic_malloc_ = true; | dynamic_malloc_ = true; | ||||
| } | } | ||||
| } | } | ||||
| } | |||||
| void CPUResourceManager::MemMalloc(const session::KernelGraph *graph) { | |||||
| if (dynamic_malloc_) { | if (dynamic_malloc_) { | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -17,7 +17,7 @@ | |||||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ | #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include <unordered_map> | |||||
| #include <map> | |||||
| #include "backend/session/kernel_graph.h" | #include "backend/session/kernel_graph.h" | ||||
| #include "backend/session/session_basic.h" | #include "backend/session/session_basic.h" | ||||
| #include "runtime/device/device_address.h" | #include "runtime/device/device_address.h" | ||||
| @@ -30,8 +30,7 @@ class CPUResourceManager { | |||||
| CPUResourceManager() = default; | CPUResourceManager() = default; | ||||
| ~CPUResourceManager(); | ~CPUResourceManager(); | ||||
| void MemPlan(const session::KernelGraph *graph); | |||||
| void MemMalloc(const session::KernelGraph *graph); | |||||
| void AssignMemory(const session::KernelGraph *graph); | |||||
| void IncreaseAddressRefCount(const session::KernelGraph *graph); | void IncreaseAddressRefCount(const session::KernelGraph *graph); | ||||
| void DecreaseAddressRefCount(const AnfNodePtr &kernel); | void DecreaseAddressRefCount(const AnfNodePtr &kernel); | ||||
| void *MemMalloc(size_t mem_size); | void *MemMalloc(size_t mem_size); | ||||
| @@ -46,7 +45,7 @@ class CPUResourceManager { | |||||
| size_t mem_size_{0}; | size_t mem_size_{0}; | ||||
| uint8_t *mem_ptr_{nullptr}; | uint8_t *mem_ptr_{nullptr}; | ||||
| bool dynamic_malloc_{false}; | bool dynamic_malloc_{false}; | ||||
| std::unordered_map<void *, size_t> dynamic_mem_; | |||||
| std::map<void *, size_t> dynamic_mem_; | |||||
| }; | }; | ||||
| } // namespace cpu | } // namespace cpu | ||||
| } // namespace device | } // namespace device | ||||
| @@ -19,9 +19,9 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace cpu { | namespace cpu { | ||||
| void CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) { | |||||
| size_t CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | MS_EXCEPTION_IF_NULL(graph); | ||||
| size_t total_mem_size = 0; | |||||
| size_t total_mem_size = 32; | |||||
| auto kernels = graph->execution_order(); | auto kernels = graph->execution_order(); | ||||
| for (const auto &kernel : kernels) { | for (const auto &kernel : kernels) { | ||||
| MS_EXCEPTION_IF_NULL(kernel); | MS_EXCEPTION_IF_NULL(kernel); | ||||
| @@ -58,15 +58,8 @@ void CPUSimpleMemPlan::MemPlan(const session::KernelGraph *graph) { | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| graph_mem_size_[graph] = total_mem_size; | |||||
| } | |||||
| size_t CPUSimpleMemPlan::GetGraphMemSize(const session::KernelGraph *graph) const { | |||||
| auto iter = graph_mem_size_.find(graph); | |||||
| if (iter != graph_mem_size_.end()) { | |||||
| return iter->second; | |||||
| } | |||||
| return 0; | |||||
| return total_mem_size; | |||||
| } | } | ||||
| void CPUSimpleMemPlan::MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr) { | void CPUSimpleMemPlan::MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr) { | ||||
| @@ -17,7 +17,6 @@ | |||||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_SIMPLE_MEM_PLAN_H_ | #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_SIMPLE_MEM_PLAN_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include <unordered_map> | |||||
| #include "backend/session/kernel_graph.h" | #include "backend/session/kernel_graph.h" | ||||
| #include "runtime/device/device_address.h" | #include "runtime/device/device_address.h" | ||||
| @@ -29,12 +28,8 @@ class CPUSimpleMemPlan { | |||||
| CPUSimpleMemPlan() = default; | CPUSimpleMemPlan() = default; | ||||
| ~CPUSimpleMemPlan() = default; | ~CPUSimpleMemPlan() = default; | ||||
| void MemPlan(const session::KernelGraph *graph); | |||||
| size_t MemPlan(const session::KernelGraph *graph); | |||||
| void MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr); | void MemAssign(const session::KernelGraph *graph, uint8_t *base_ptr); | ||||
| size_t GetGraphMemSize(const session::KernelGraph *graph) const; | |||||
| private: | |||||
| std::unordered_map<const session::KernelGraph *, size_t> graph_mem_size_; | |||||
| }; | }; | ||||
| } // namespace cpu | } // namespace cpu | ||||
| } // namespace device | } // namespace device | ||||
| @@ -355,7 +355,6 @@ void KernelRuntime::AssignStaticMemoryOutput(session::KernelGraph *graph) { | |||||
| if (!item_with_index.first->isa<CNode>() || !AnfAlgo::IsRealKernel(item_with_index.first)) { | if (!item_with_index.first->isa<CNode>() || !AnfAlgo::IsRealKernel(item_with_index.first)) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| graph->AddFinalOutputKernel(item_with_index.first); | |||||
| if (AnfAlgo::IsCommunicationOp(item_with_index.first)) { | if (AnfAlgo::IsCommunicationOp(item_with_index.first)) { | ||||
| AssignCommunicationNodeMem(kStaticMem, item_with_index.first); | AssignCommunicationNodeMem(kStaticMem, item_with_index.first); | ||||
| } else { | } else { | ||||
| @@ -309,12 +309,7 @@ INPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{1, INPUT_DESC(features)}, {2, INPUT | |||||
| ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP; | ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP; | ||||
| OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}}; | OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}}; | ||||
| // MeanGrad | |||||
| INPUT_MAP(MeanGrad) = {{1, INPUT_DESC(x)}}; | |||||
| INPUT_ATTR_MAP(MeanGrad) = {{2, ATTR_DESC(mean_grad_output_shape_value, kOpFormat_NHWC, | |||||
| AnyTraits<std::vector<int64_t>>(), AnyTraits<int64_t>())}}; | |||||
| ATTR_MAP(MeanGrad) = {{"mode", ATTR_DESC(mode, AnyTraits<int64_t>())}}; | |||||
| // SliceD | |||||
| INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}}; | INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}}; | ||||
| INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}, | INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}, | ||||
| {3, ATTR_DESC(size, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}}; | {3, ATTR_DESC(size, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}}; | ||||
| @@ -431,11 +426,6 @@ INPUT_MAP(TopK) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(k)}}; | |||||
| ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits<bool>())}}; | ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits<bool>())}}; | ||||
| OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}}; | OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}}; | ||||
| // Multiply | |||||
| INPUT_MAP(Multiply) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; | |||||
| ATTR_MAP(Multiply) = EMPTY_ATTR_MAP; | |||||
| OUTPUT_MAP(Multiply) = {{0, OUTPUT_DESC(z)}}; | |||||
| // TileD | // TileD | ||||
| INPUT_MAP(TileD) = {{1, INPUT_DESC(x)}}; | INPUT_MAP(TileD) = {{1, INPUT_DESC(x)}}; | ||||
| INPUT_ATTR_MAP(TileD) = {{2, ATTR_DESC(multiples, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}}; | INPUT_ATTR_MAP(TileD) = {{2, ATTR_DESC(multiples, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}}; | ||||