
initial commit

master
Hsword 4 years ago
parent commit 7fd3de80ce
100 changed files with 5479 additions and 134 deletions
1. +6 -0 .gitmodules
2. +59 -0 CMakeLists.txt
3. +33 -0 COMMITTERS.md
4. +53 -0 CONTRIBUTING.md
5. +201 -124 LICENSE
6. +167 -10 README.md
7. +2 -0 bin/heturun
8. +75 -0 cmake/Modules/FindCUDNN.cmake
9. +70 -0 cmake/Modules/FindMETIS.cmake
10. +14 -0 cmake/Modules/FindMKL.cmake
11. +97 -0 cmake/Modules/FindNCCL.cmake
12. +47 -0 cmake/Modules/FindZMQ.cmake
13. +55 -0 cmake/config.example.cmake
14. +84 -0 environment.yml
15. +49 -0 examples/cnn/README.md
16. +10 -0 examples/cnn/local_s1.yml
17. +202 -0 examples/cnn/main.py
18. +61 -0 examples/cnn/models/AlexNet.py
19. +41 -0 examples/cnn/models/CNN.py
20. +90 -0 examples/cnn/models/LSTM.py
21. +46 -0 examples/cnn/models/LeNet.py
22. +24 -0 examples/cnn/models/LogReg.py
23. +33 -0 examples/cnn/models/MLP.py
24. +56 -0 examples/cnn/models/RNN.py
25. +125 -0 examples/cnn/models/ResNet.py
26. +100 -0 examples/cnn/models/VGG.py
27. +9 -0 examples/cnn/models/__init__.py
28. +4 -0 examples/cnn/pytorch_models/__init__.py
29. +20 -0 examples/cnn/pytorch_models/mlp.py
30. +116 -0 examples/cnn/pytorch_models/resnet.py
31. +36 -0 examples/cnn/pytorch_models/rnn.py
32. +48 -0 examples/cnn/pytorch_models/vgg.py
33. +309 -0 examples/cnn/run_tf_horovod.py
34. +9 -0 examples/cnn/scripts/hetu_16gpu.sh
35. +11 -0 examples/cnn/scripts/hetu_1gpu.sh
36. +10 -0 examples/cnn/scripts/hetu_2gpu_ps.sh
37. +8 -0 examples/cnn/scripts/hetu_8gpu.sh
38. +11 -0 examples/cnn/scripts/horovod_16gpu.sh
39. +6 -0 examples/cnn/scripts/horovod_8gpu.sh
40. +18 -0 examples/cnn/scripts/pytorch_16gpu_0.sh
41. +18 -0 examples/cnn/scripts/pytorch_16gpu_1.sh
42. +7 -0 examples/cnn/scripts/pytorch_1gpu.sh
43. +18 -0 examples/cnn/scripts/pytorch_8gpu.sh
44. +15 -0 examples/cnn/scripts/tf_16gpu_worker0.sh
45. +14 -0 examples/cnn/scripts/tf_16gpu_worker1.sh
46. +10 -0 examples/cnn/scripts/tf_1gpu.sh
47. +15 -0 examples/cnn/scripts/tf_8gpu.sh
48. +23 -0 examples/cnn/settings/tf_dist_s1_w16.json
49. +11 -0 examples/cnn/settings/tf_dist_s1_w4.json
50. +15 -0 examples/cnn/settings/tf_dist_s1_w8.json
51. +49 -0 examples/cnn/tf_launch_server.py
52. +234 -0 examples/cnn/tf_launch_worker.py
53. +194 -0 examples/cnn/tf_main.py
54. +8 -0 examples/cnn/tf_models/__init__.py
55. +45 -0 examples/cnn/tf_models/tf_CNN.py
56. +81 -0 examples/cnn/tf_models/tf_LSTM.py
57. +49 -0 examples/cnn/tf_models/tf_LeNet.py
58. +23 -0 examples/cnn/tf_models/tf_LogReg.py
59. +34 -0 examples/cnn/tf_models/tf_MLP.py
60. +49 -0 examples/cnn/tf_models/tf_RNN.py
61. +113 -0 examples/cnn/tf_models/tf_ResNet.py
62. +103 -0 examples/cnn/tf_models/tf_VGG.py
63. +213 -0 examples/cnn/torch_main.py
64. +9 -0 examples/cnn/worker_conf0.json
65. +9 -0 examples/cnn/worker_conf1.json
66. +2 -0 examples/ctr/.gitignore
67. +109 -0 examples/ctr/README.md
68. +3 -0 examples/ctr/kill.sh
69. +5 -0 examples/ctr/models/__init__.py
70. +63 -0 examples/ctr/models/dc_criteo.py
71. +68 -0 examples/ctr/models/dcn_criteo.py
72. +59 -0 examples/ctr/models/deepfm_criteo.py
73. +320 -0 examples/ctr/models/load_data.py
74. +56 -0 examples/ctr/models/wdl_adult.py
75. +42 -0 examples/ctr/models/wdl_criteo.py
76. +230 -0 examples/ctr/run_hetu.py
77. +174 -0 examples/ctr/run_tf_horovod.py
78. +202 -0 examples/ctr/run_tf_local.py
79. +211 -0 examples/ctr/run_tf_parallax.py
80. +10 -0 examples/ctr/settings/local_s1.yml
81. +10 -0 examples/ctr/settings/local_s1_w2.yml
82. +10 -0 examples/ctr/settings/local_s1_w4.yml
83. +10 -0 examples/ctr/settings/local_s1_w8.yml
84. +6 -0 examples/ctr/settings/local_w4.yml
85. +4 -0 examples/ctr/settings/plx_local_spec.yml
86. +9 -0 examples/ctr/settings/tf_local_s1_w2.json
87. +11 -0 examples/ctr/settings/tf_local_s1_w4.json
88. +15 -0 examples/ctr/settings/tf_local_s1_w8.json
89. +7 -0 examples/ctr/tests/hybrid_dcn_criteo.sh
90. +7 -0 examples/ctr/tests/hybrid_dfm_criteo.sh
91. +7 -0 examples/ctr/tests/hybrid_wdl_adult.sh
92. +7 -0 examples/ctr/tests/hybrid_wdl_criteo.sh
93. +6 -0 examples/ctr/tests/local_dcn_criteo.sh
94. +6 -0 examples/ctr/tests/local_dfm_criteo.sh
95. +6 -0 examples/ctr/tests/local_wdl_adult.sh
96. +6 -0 examples/ctr/tests/local_wdl_criteo.sh
97. +6 -0 examples/ctr/tests/ps_dcn_criteo.sh
98. +6 -0 examples/ctr/tests/ps_dfm_criteo.sh
99. +6 -0 examples/ctr/tests/ps_wdl_adult.sh
100. +6 -0 examples/ctr/tests/ps_wdl_criteo.sh

+6 -0 .gitmodules

@@ -0,0 +1,6 @@
[submodule "third_party/GraphMix"]
path = third_party/GraphMix
url = https://github.com/nox-410/GraphMix.git
[submodule "third_party/HetuML"]
path = third_party/HetuML
url = https://github.com/ccchengff/HetuML.git

+59 -0 CMakeLists.txt

@@ -0,0 +1,59 @@
cmake_minimum_required(VERSION 3.18)

project(Hetu CXX)

include(cmake/config.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_CXX_FLAGS "-O3 -Wall")

# openmp
find_package(OpenMP REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

# compile flag
if(${HETU_VERSION} STREQUAL "all")
  set(HETU_COMPILE_GPU ON)
  set(HETU_COMPILE_MKL ON)
elseif(${HETU_VERSION} STREQUAL "gpu")
  set(HETU_COMPILE_GPU ON)
  set(HETU_COMPILE_MKL OFF)
elseif(${HETU_VERSION} STREQUAL "mkl")
  set(HETU_COMPILE_GPU OFF)
  set(HETU_COMPILE_MKL ON)
else()
  message(FATAL_ERROR "unknown hetu version")
endif()
message(STATUS "HETU version: ${HETU_VERSION}")

# cuda
if(${HETU_COMPILE_GPU})
  set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc)
  file(READ ${CUDAToolkit_ROOT}/version.txt RAW_CUDA_VERSION)
  string(REGEX MATCH "[0-9\.]+" CUDA_VERSION ${RAW_CUDA_VERSION})
  if(${CUDA_VERSION} VERSION_LESS "10.1")
    message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDA_VERSION}")
  endif()
  find_package(CUDAToolkit REQUIRED)
  enable_language(CUDA)
endif()

include(FetchContent) # download third_party

add_subdirectory(${CMAKE_SOURCE_DIR}/src)

if(${HETU_PS})
  add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite)
endif()

if(${HETU_GEOMETRIC})
  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix)
endif()
if (HETU_ML)
  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML)
endif()
enable_testing()

+33 -0 COMMITTERS.md

@@ -0,0 +1,33 @@
## Committer

Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer.

One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers.

Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md).

## Committer Members
### Current Committer
- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](swordonline@foxmail.com)>
- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](ccchengff@gmail.com)>
- [codecaution](https://github.com/codecaution) - **Xiaonan Nie**
- [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang**
- [nox-410](https://github.com/nox-410) - **Yining Shi**
- [initzhang](https://github.com/initzhang) - **Xin Zhang**
- [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang**
- [AFDWang](https://github.com/AFDWang) - **Yujie Wang**
- [sj1104](https://github.com/sj1104) - **Jia Shen**
- [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou**
- [zmxdream](https://github.com/zmxdream) - **Minxu Zhang**

We would like to sincerely thank the following community members for their contributions to Hetu.

- [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)**
- [lbluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
- [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)**
- [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)**
- [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)**
- [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao**
- [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue**
- **Lizi Su**
- **Dong Li**

+53 -0 CONTRIBUTING.md

@@ -0,0 +1,53 @@
# Contributing to Hetu
Welcome to [report Issues](https://github.com/PKU-DAIR/Hetu/issues) or [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It's recommended to read the following Contributing Guide first before contributing.


## Issues
We use Github Issues to track public bugs and feature requests.

### Search Known Issues First
Please search the existing issues to see if any similar issue or feature request has already been filed. You should make sure your issue isn't redundant.

### Reporting New Issues
If you open an issue, the more information the better: for example, a detailed description, screenshots or a video of the problem, and logs or code blocks for the crash.

## Pull Requests
We strongly welcome your pull request to make Hetu better.

### Branch Management
There are two kinds of branches here:

1. `main` branch.

(1). It is the latest (pre-)release branch. We use `main` for tags, with version number `1.0.0`, `1.1.0`, `1.2.0`...

(2). **Don't submit any PR on `main` branch.**
2. `specific version` branches.

(1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable development branch. After full testing, the `specific version` branch will be merged into the `main` branch for the next release.

(2). **You are recommended to submit bugfix or feature PR on `specific version` branch.**


Normal bugfixes or feature requests should be submitted to the `specific version` branch. After full testing, we will merge them into the `main` branch for the next release.


### Make Pull Requests
The code team will monitor all pull requests and run code checks and tests on them. After all tests pass, we will accept the PR, but it will not be merged into the `main` branch at once; there may be some delay.

Before submitting a pull request, please make sure the followings are done:

1. Fork the repo and create your branch from `main` or `specific version`.
2. Update code or documentation if you have changed APIs.
3. Add the copyright notice to the top of any new files you've added.
4. Make sure your code passes lint and style checks.
5. Test your code thoroughly.
6. Now, you can submit your pull request on `specific version` branch.

## Code Style Guide
Use [Code Style](./.clang-format) for Python and C++.

## License
By contributing to Hetu, you agree that your contributions will be licensed
under [License](LICENSE)

+201 -124 LICENSE

@@ -1,124 +1,201 @@
木兰宽松许可证, 第2版

2020年1月 http://license.coscl.org.cn/MulanPSL2

您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束:

0. 定义

“软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。

“贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。

“贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。

“法人实体” 是指提交贡献的机构及其“关联实体”。

“关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。

1. 授予版权许可

每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。

2. 授予专利许可

每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。

3. 无商标许可

“本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。

4. 分发限制

您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。

5. 免责声明与责任限制

“软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。

6. 语言

“本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。

条款结束

如何将木兰宽松许可证,第2版,应用到您的软件

如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步:

1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字;

2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中;

3, 请将如下声明文本放入每个源文件的头部注释中。

Copyright (c) [Year] [name of copyright holder]
[Software Name] is licensed under Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
Mulan Permissive Software License,Version 2
Mulan Permissive Software License,Version 2 (Mulan PSL v2)

January 2020 http://license.coscl.org.cn/MulanPSL2

Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions:

0. Definition

Software means the program and related documents which are licensed under this License and comprise all Contribution(s).

Contribution means the copyrightable work licensed by a particular Contributor under this License.

Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License.

Legal Entity means the entity making a Contribution and all its Affiliates.

Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity.

1. Grant of Copyright License

Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not.

2. Grant of Patent License

Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken.

3. No Trademark License

No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4.

4. Distribution Restriction

You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software.

5. Disclaimer of Warranty and Limitation of Liability

THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Language

THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL.

END OF THE TERMS AND CONDITIONS

How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software

To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps:

Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner;
Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package;
Attach the statement to the appropriate annotated syntax at the beginning of each source file.
Copyright (c) [Year] [name of copyright holder]
[Software Name] is licensed under Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [2021] [Peking University]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+167 -10 README.md

@@ -1,20 +1,177 @@
<div align=center>
<img src="./img/hetu.png" width="300" />
</div>


# HETU

<!--- [![license](https://img.shields.io/github/license/apache/zookeeper?color=282661)](LICENSE) --->

[Documentation](https://hetu-doc.readthedocs.io) |
[Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html)

Hetu is a high-performance distributed deep learning system targeting the training of DL models with trillions of parameters, developed by <a href="http://net.pku.edu.cn/~cuibin/" target="_blank" rel="nofollow">DAIR Lab</a> at Peking University. It takes into account both high availability in industry and innovation in academia, and has a number of advanced characteristics:

- Applicability. DL model definition with a standard dataflow graph; many basic CPU and GPU operators; efficient implementations of a wide range of DL models and at least 10 popular ML algorithms.

- Efficiency. Achieve at least 30% speedup compared to TensorFlow on DNN, CNN, RNN benchmarks.

- Flexibility. Supporting various parallel training protocols and distributed communication architectures, such as Data/Model/Pipeline parallel; Parameter server & AllReduce.

- Scalability. Deployment on more than 100 computation nodes; training giant models with trillions of parameters, e.g., on Criteo Kaggle and Open Graph Benchmark.

- Agility. Automated ML pipeline: feature engineering, model selection, hyperparameter search.

We welcome everyone interested in machine learning or graph computing to contribute code, create issues, or open pull requests. Please refer to the [Contribution Guide](CONTRIBUTING.md) for more details.

## Installation
1. Clone the repository.

2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used: `conda env create -f environment.yml`.
Please prepare the CUDA toolkit and cuDNN in advance.

3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
```bash
# modify paths and configurations in cmake/config.cmake

# generate Makefile
mkdir build && cd build && cmake ..


# compile
# make all
make -j 8
# make hetu, version is specified in cmake/config.cmake
make hetu -j 8
# make allreduce module
make allreduce -j 8
# make ps module
make ps -j 8
# make geometric module
make geometric -j 8
# make hetu-cache module
make hetu_cache -j 8
```



4. Prepare the environment for running. Edit the `hetu.exp` file and set the environment path for Python and the path to the `mpirun` executable if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp`.



## Usage

Train logistic regression on gpu:


```bash
bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST
```

Train a 3-layer mlp on gpu:


```bash
bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10
```

Train a 3-layer cnn with gpu:

```bash
bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST
```


Train a 3-layer mlp with allreduce on 8 gpus (use mpirun):
```bash
bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10
```

Train a 3-layer mlp with PS on 1 server and 2 workers:
```bash
# in the script we launch the scheduler and server, and two workers
bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10
```
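These scripts all drive the same Python entry point. As a rough sketch of what they set up, modeled on the `examples/cnn/main.py` trainer included in this commit (the `models.mlp` choice, batch size, and learning rate below are illustrative, not prescribed):

```python
import hetu as ht
import models  # the model zoo under examples/cnn/models in this commit

# Illustrative data: CIFAR10, flattened for the MLP model.
train_x, train_y, valid_x, valid_y = ht.data.normalize_cifar(num_class=10)
train_x = train_x.reshape(train_x.shape[0], -1)
valid_x = valid_x.reshape(valid_x.shape[0], -1)

# Dataloader ops feed minibatches for the 'train' and 'validate' phases.
x = ht.dataloader_op([ht.Dataloader(train_x, 128, 'train'),
                      ht.Dataloader(valid_x, 128, 'validate')])
y_ = ht.dataloader_op([ht.Dataloader(train_y, 128, 'train'),
                       ht.Dataloader(valid_y, 128, 'validate')])

loss, y = models.mlp(x, y_)                       # build the dataflow graph
train_op = ht.optim.SGDOptimizer(learning_rate=0.1).minimize(loss)

# One training pass over the dataset on GPU 0.
executor = ht.Executor({'train': [loss, y, y_, train_op],
                        'validate': [loss, y, y_]}, ctx=ht.gpu(0))
for _ in range(executor.get_batch_num('train')):
    loss_val, _, _, _ = executor.run('train',
                                     eval_node_list=[loss, y, y_, train_op])
    print(loss_val.asnumpy())
```

The distributed scripts keep this graph definition unchanged and only vary how it is executed; the trainer's `--comm-mode` flag selects PS, AllReduce, or Hybrid communication.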


## More Examples
Please refer to the examples directory, which contains CNN, NLP, CTR, and GNN training scripts. For distributed training, please refer to the CTR and GNN tasks.

## Community
* Email: xupeng.miao@pku.edu.cn
* Slack: coming soon
* Hetu homepage: https://hetu-doc.readthedocs.io
* [Committers & Contributors](COMMITTERS.md)
* [Contributing to Hetu](CONTRIBUTING.md)
* [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html)

## Enterprise Users

If you are an enterprise user and find Hetu useful in your work, please let us know, and we will be glad to add your company logo here.

<img src="./img/tencent.png" width = "200"/>
<img src="./img/alibabacloud.png" width = "200"/>
<img src="./img/kuaishou.png" width = "200"/>

## License

The entire codebase is under [license](LICENSE)

## Papers
1. Xupeng Miao, Linxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs.](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021
2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773) SIGMOD 2021
3. coming soon

## Acknowledgements

We learned and borrowed insights from a few open source projects including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel).

## Appendix
The prerequisites for the different modules in Hetu are listed as follows:
```
"*" means you should prepare by yourself, while others support auto-download
Hetu: OpenMP(*), CMake(*)
Hetu (version mkl): MKL 1.6.1
Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*)
Hetu (version all): both

Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs GPU version

Hetu-PS: Protobuf(*), ZeroMQ 4.3.2

Hetu-Geometric: Pybind11(*), Metis(*)

Hetu-Cache: Pybind11(*), this module needs PS module

##################################################################
Tips for preparing the prerequisites
Preparing CUDA, CUDNN, NCCL (NCCL is already in the conda environment):
1. download from https://developer.nvidia.com
2. install
3. modify paths in cmake/config.cmake if necessary
Preparing OpenMP:
You just need to ensure your compiler supports OpenMP.

Preparing CMake, Protobuf, Pybind11, Metis:
Install by anaconda:
conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis

Preparing OpenMPI (not necessary):
install by anaconda: `conda install -c conda-forge openmpi=4.0.3`
or
1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
2. build openmpi by `./configure /path/to/build && make -j8 && make install`
3. modify MPI_HOME to /path/to/build in cmake/config.cmake

Preparing MKL (not necessary):
install by anaconda: `conda install -c conda-forge onednn`
or
1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz
2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake

Preparing ZeroMQ (not necessary):
install by anaconda: `conda install -c anaconda zeromq=4.3.2`
or
1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip
2. build zeromq by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
3. modify ZMQ_ROOT to /path/to/build in cmake/config.cmake
```

+2 -0 bin/heturun

@@ -0,0 +1,2 @@
#!/bin/bash
python $(cd $(dirname $0); pwd)/../python/runner.py $@

+75 -0 cmake/Modules/FindCUDNN.cmake

@@ -0,0 +1,75 @@
# Find the CUDNN libraries
#
# The following variables are optionally searched for defaults
# CUDNN_ROOT: Base directory where CUDNN is found
# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for
# CUDNN_LIBRARY: Directory where CUDNN library is searched for
# CUDNN_STATIC: Are we looking for a static library? (default: no)
#
# The following are set after configuration is done:
# CUDNN_FOUND
# CUDNN_INCLUDE_PATH
# CUDNN_LIBRARY_PATH
#

set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN")
if (DEFINED ENV{CUDNN_ROOT_DIR})
message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.")
endif()
list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})

# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT})

set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files")

find_path(CUDNN_INCLUDE_PATH cudnn.h
HINTS ${CUDNN_INCLUDE_DIR}
PATH_SUFFIXES cuda/include cuda include
REQUIRED)

option(CUDNN_STATIC "Look for static CUDNN" OFF)
if (CUDNN_STATIC)
set(CUDNN_LIBNAME "libcudnn_static.a")
else()
set(CUDNN_LIBNAME "cudnn")
endif()

set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)")
if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC)
message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.")
endif()

find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
PATHS ${CUDNN_LIBRARY}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64
REQUIRED)

set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${CUDNN_INCLUDE_PATH}/cudnn.h\"
int main()
{
std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' << CUDNN_PATCHLEVEL;
int x = cudnnGetVersion();
return x == CUDNN_VERSION;
}
")
try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE CUDNN_VERSION
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
LINK_LIBRARIES ${CUDNN_LIBRARY_PATH})
if (NOT CUDNN_VERSION_MATCHED)
message(FATAL_ERROR "Found CUDNN header version and library version do not match! \
(include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.")
endif()
message(STATUS "CUDNN version: ${CUDNN_VERSION}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
CUDNN
REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH
VERSION_VAR CUDNN_VERSION)

mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)

+70 -0 cmake/Modules/FindMETIS.cmake

@@ -0,0 +1,70 @@
# Accepts the following variables:
#
# METIS_ROOT: Prefix where METIS is installed.
# METIS_LIB_NAME: Name of the METIS library (default: metis).
# METIS_LIBRARY: Full path of the METIS library.

# Sets the following variables:
#
# METIS_LIBRARY: Full path of the METIS library.
# METIS_FOUND: True if METIS was found.
# METIS_LIBRARIES: List of all libraries needed for linking with METIS,
#
# Provides the following macros:
#
# find_package(METIS)
#
# Searches for METIS (See above)


# search metis header
find_path(METIS_INCLUDE_DIR metis.h
PATHS ${METIS_DIR} ${METIS_ROOT}
PATH_SUFFIXES metis include include/metis Lib METISLib
NO_DEFAULT_PATH
DOC "Include directory of metis")
find_path(METIS_INCLUDE_DIR metis.h
PATH_SUFFIXES metis include include/metis Lib METISLib)

set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library")

# search metis library
if(NOT METIS_LIB_NAME)
set(METIS_LIB_NAME metis)
endif(NOT METIS_LIB_NAME)

find_library(METIS_LIBRARY ${METIS_LIB_NAME}
PATHS ${METIS_DIR} ${METIS_ROOT}
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
find_library(METIS_LIBRARY ${METIS_LIB_NAME}
PATH_SUFFIXES lib
)

# behave like a CMake module is supposed to behave
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
"METIS"
DEFAULT_MSG
METIS_INCLUDE_DIR
METIS_LIBRARY
)

mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME)

# if both headers and library are found, store results
if(METIS_FOUND)
set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR})
set(METIS_LIBRARIES ${METIS_LIBRARY})
# log result
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Determing location of METIS succeded:\n"
"Include directory: ${METIS_INCLUDE_DIRS}\n"
"Library directory: ${METIS_LIBRARIES}\n\n")
else(METIS_FOUND)
# log erroneous result
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
"Determining location of METIS failed:\n"
"Include directory: ${METIS_INCLUDE_DIRS}\n"
"Library directory: ${METIS_LIBRARIES}\n\n")
endif(METIS_FOUND)

+14 -0 cmake/Modules/FindMKL.cmake

@@ -0,0 +1,14 @@
# - Try to find DNNL(MKL-DNN)
# Once done this will define
# DNNL_FOUND - System has DNNL
# DNNL_INCLUDE_DIR - The DNNL include directories
# DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build
# DNNL_LIBRARY - The libraries needed to use DNNL
# DNNL_DEFINITIONS - Compiler switches required for using DNNL

find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include )
find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include )
find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src )

include ( FindPackageHandleStandardArgs )
find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR )

+97 -0 cmake/Modules/FindNCCL.cmake

@@ -0,0 +1,97 @@
# Try to find NCCL
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

if (NOT DEFINED NCCL_ROOT)
set(NCCL_ROOT $ENV{CONDA_PREFIX})
endif()

set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL")

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS
${NCCL_ROOT}
${NCCL_ROOT}/include
${NCCL_INCLUDE_DIR}
${NCCL_ROOT_DIR}
${NCCL_ROOT_DIR}/include
${CUDA_TOOLKIT_ROOT_DIR}/include
REQUIRED)

if ($ENV{USE_STATIC_NCCL})
message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
set(NCCL_LIBNAME "libnccl_static.a")
else()
set(NCCL_LIBNAME "nccl")
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS
${NCCL_LIB_DIR}
${NCCL_ROOT}
${NCCL_ROOT}/lib
${NCCL_ROOT}/lib/x86_64-linux-gnu
${NCCL_ROOT}/lib64
${NCCL_ROOT_DIR}
${NCCL_ROOT_DIR}/lib
${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
${NCCL_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib64
REQUIRED)

set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${NCCL_HEADER_FILE}\"
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION}")
else()
message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
NCCL
REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES
VERSION_VAR NCCL_VERSION)

+47 -0 cmake/Modules/FindZMQ.cmake

@@ -0,0 +1,47 @@
# - Try to find ZMQ
# Once done this will define
# ZMQ_FOUND - System has ZMQ
# ZMQ_INCLUDE_DIRS - The ZMQ include directories
# ZMQ_LIBRARIES - The libraries needed to use ZMQ
# ZMQ_DEFINITIONS - Compiler switches required for using ZMQ

find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include )
find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib )

set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} )
set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} )

if (DEFINED ZMQ_LIBRARIES AND DEFINED ZMQ_INCLUDE_DIRS)
set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${ZMQ_INCLUDE_DIRS}/zmq.h\"
int main()
{
std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH;
int x, y, z;
zmq_version(&x, &y, &z);
return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH;
}
")
try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE ZMQ_VERSION
LINK_LIBRARIES ${ZMQ_LIBRARIES})
if (NOT ZMQ_VERSION_MATCHED)
message(WARNING "Found ZMQ header version and library version do not match! \
(include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). Please set ZMQ_ROOT and ZMQ_BUILD carefully.")
unset(ZMQ_INCLUDE_DIRS)
unset(ZMQ_LIBRARIES)
unset(ZMQ_VERSION)
else ()
message(STATUS "ZMQ version: ${ZMQ_VERSION}")
endif()
endif()

include ( FindPackageHandleStandardArgs )
# handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args (
ZMQ
REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS
VERSION_VAR ZMQ_VERSION)

+55 -0 cmake/config.example.cmake

@@ -0,0 +1,55 @@
######################
### Set targets ######
######################

# hetu main version, choose from (mkl, gpu, all)
# if using mkl (for CPU) or all, OpenMP(*), mkl required
# if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required
set(HETU_VERSION "all")

# whether to compile allreduce module
# nccl(*), openmpi required
set(HETU_ALLREDUCE ON)

# whether to compile ps module
# protobuf(*), zeromq required
set(HETU_PS ON)

# whether to compile geometric module (for GNNs)
# pybind11(*), metis(*) required
set(HETU_GEOMETRIC ON)

# whether to compile cache module (for PS)
# to enable this, you must turn HETU_PS on
# pybind11(*) required
set(HETU_CACHE ON)

# whether to compile Hetu ML Module
set(HETU_ML ON)
set(HETU_PARALLEL_ML ON)

######################
### Set paths ########
######################

# CUDA version >= 10.1
set(CUDAToolkit_ROOT /usr/local/cuda)

# NCCL version >= 2.8
set(NCCL_ROOT $ENV{CONDA_PREFIX})

set(CUDNN_ROOT)

# MPI version >= 3.1 (OpenMPI version >= 4.0.3)
# if a valid version is not found, we'll download and compile it on the fly (openmpi-4.0.3)
set(MPI_HOME $ENV{CONDA_PREFIX})

# MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl
# if not found, we'll download and compile it on the fly
set(MKL_ROOT $ENV{CONDA_PREFIX})
set(MKL_BUILD $ENV{CONDA_PREFIX})

# ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq
# if not found, we'll download and compile it on the fly
set(ZMQ_ROOT $ENV{CONDA_PREFIX})
set(ZMQ_BUILD $ENV{CONDA_PREFIX})

+84 -0 environment.yml

@@ -0,0 +1,84 @@
name: hetu
channels:
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- bcrypt=3.2.0=py37h5e8e339_1
- blas=1.0=mkl
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2021.7.5=h06a4308_1
- certifi=2021.5.30=py37h06a4308_0
- cffi=1.14.6=py37hc58025e_0
- cmake=3.18.2=ha30ef3c_0
- cryptography=3.4.7=py37h5d9358c_0
- cudatoolkit=10.1.243=h6bb024c_0
- expat=2.4.1=h2531618_2
- intel-openmp=2021.3.0=h06a4308_3350
- joblib=1.0.1=pyhd3eb1b0_0
- krb5=1.18.2=h173b8e3_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libcurl=7.71.1=h20c2e04_1
- libedit=3.1.20210216=h27cfd23_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgfortran-ng=7.5.0=h14aa051_19
- libgfortran4=7.5.0=h14aa051_19
- libgomp=9.3.0=h5101ec6_17
- libprotobuf=3.15.8=h780b84a_0
- libsodium=1.0.18=h7b6447c_0
- libssh2=1.9.0=h1ba5d50_1
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libuv=1.40.0=h7b6447c_0
- lz4-c=1.9.3=h2531618_0
- metis=5.1.0=hf484d3e_4
- mkl=2021.3.0=h06a4308_520
- mkl-service=2.4.0=py37h7f8727e_0
- mkl_fft=1.3.0=py37h42c9631_2
- mkl_random=1.2.2=py37h51133e4_0
- mpi=1.0=openmpi
- nccl=2.8.3.1=hcaf9a05_0
- ncurses=6.2=he6710b0_1
- numpy=1.20.3=py37hf144106_0
- numpy-base=1.20.3=py37h74d4b33_0
- onednn=2.3=omp_hf4ef041_0
- onnx=1.9.0=py37h284874a_0
- onnxruntime=1.7.2=py37he8cb6d3_1
- openmpi=4.0.3=hdf1f1ad_1
- openssl=1.1.1k=h27cfd23_0
- pandas=1.2.5=py37h295c915_0
- paramiko=2.7.2=pyh9f0ad1d_0
- pip=21.1.3=py37h06a4308_0
- protobuf=3.15.8=py37hcd2ae1e_0
- psutil=5.8.0=py37h5e8e339_1
- pybind11=2.6.2=py37hff7bd54_1
- pycparser=2.20=pyh9f0ad1d_2
- pynacl=1.4.0=py37h5e8e339_2
- python=3.7.10=h12debd9_4
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python_abi=3.7=2_cp37m
- pytz=2021.1=pyhd3eb1b0_0
- pyyaml=5.4.1=py37h27cfd23_1
- re2=2021.04.01=h9c3ff4c_0
- readline=8.1=h27cfd23_0
- rhash=1.4.1=h3c74f83_1
- scikit-learn=0.24.2=py37ha9443f7_0
- scipy=1.6.2=py37had2a1c9_1
- setuptools=52.0.0=py37h06a4308_0
- six=1.16.0=pyhd3eb1b0_0
- sqlite=3.36.0=hc218d9a_0
- threadpoolctl=2.2.0=pyhb85f177_0
- tk=8.6.10=hbc83047_0
- tqdm=4.61.2=pyhd3eb1b0_1
- typing-extensions=3.10.0.0=hd8ed1ab_0
- typing_extensions=3.10.0.0=pyha770c72_0
- wheel=0.36.2=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- yaml=0.2.5=h7b6447c_0
- zeromq=4.3.2=he6710b0_3
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.9=haebb681_0
- pip:
  - cloudpickle==1.6.0
  - wget==3.2

+49 -0 examples/cnn/README.md

@@ -0,0 +1,49 @@
# CNN Examples
In this directory we provide simple implementations for CNN models, including both hetu and tensorflow versions for comparison.
## Structure
```
- cnn
    - models/              CNN models in HETU
    - pytorch_models/      CNN models in PyTorch
    - tf_models/           CNN models in TensorFlow
    - scripts/             Test scripts
    - main.py              Trainer for HETU
    - run_tf_horovod.py    Trainer for Horovod
    - tf_launch_server.py  Trainer for TF-PS (role: server)
    - tf_launch_worker.py  Trainer for TF-PS (role: worker)
    - tf_main.py           Trainer for TensorFlow
    - torch_main.py        Trainer for PyTorch
```
## Usage
Here are some examples of running scripts.
```bash
bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu
bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node)
bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 16-GPU (2 nodes)
```
To train in the PS setting, we also need to launch the scheduler and server first. For more information about distributed training, please refer to the CTR or GNN examples.

We can change the settings in the scripts; see the example script below.
```bash
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py

### validate and timing
python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing

### run in cpu
# python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing

```

For more details about training setting, please refer to `main.py`.
## Models
We provide the following models with their supported datasets.
```
CIFAR100: VGG, ResNet
CIFAR10: MLP, VGG, ResNet
MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN
```
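All of these model builders share one calling convention: a function takes the dataloader ops `x` and `y_` (plus an optional class count) and returns `(loss, y)`, which is what lets `main.py` pick a model with `eval('models.' + args.model)`. Below is a minimal sketch of a new model in that style; the layer ops mirror those used in `models/AlexNet.py`, while the loss ops (`ht.softmaxcrossentropy_op`, `ht.reduce_mean_op`) are assumed names and should be checked against the ops actually exported by your Hetu build.

```python
import hetu as ht
from hetu import init


def tiny_mlp(x, y_, num_class=10):
    # Hidden layer: flattened CIFAR10 image (3*32*32 = 3072) -> 256, ReLU.
    w1 = init.random_normal(shape=(3072, 256), stddev=0.1, name='tiny_mlp_w1')
    b1 = init.random_normal(shape=(256,), stddev=0.1, name='tiny_mlp_b1')
    h = ht.matmul_op(x, w1)
    h = ht.relu_op(h + ht.broadcastto_op(b1, h))

    # Output layer: 256 -> num_class logits.
    w2 = init.random_normal(shape=(256, num_class), stddev=0.1, name='tiny_mlp_w2')
    b2 = init.random_normal(shape=(num_class,), stddev=0.1, name='tiny_mlp_b2')
    y = ht.matmul_op(h, w2)
    y = y + ht.broadcastto_op(b2, y)

    # Assumed loss ops -- verify the exact names in the installed hetu package.
    loss = ht.softmaxcrossentropy_op(y, y_)
    loss = ht.reduce_mean_op(loss, [0])
    return loss, y
```

Exporting such a function from `models/__init__.py` would then make it selectable through the trainer's `--model` flag.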

+10 -0 examples/cnn/local_s1.yml

@@ -0,0 +1,10 @@
shared :
  DMLC_PS_ROOT_URI : 127.0.0.1
  DMLC_PS_ROOT_PORT : 13030
  DMLC_NUM_WORKER : 2
  DMLC_NUM_SERVER : 1
  DMLC_PS_VAN_TYPE : p3
launch :
  worker : 0
  server : 1
  scheduler : true
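The `shared` keys are standard ps-lite environment variables (root URI/port, worker and server counts), and `launch` says which roles this host starts: one scheduler, one server, and no local workers. The actual launcher lives in `python/runner.py` (invoked by `bin/heturun`) and is not shown in this diff; the sketch below only illustrates how such a config could map onto processes, and the entry-point command is a placeholder.

```python
import os
import subprocess
import yaml  # pyyaml is pinned in environment.yml

CONF_PATH = 'examples/cnn/local_s1.yml'              # this file
ENTRY = ['python', 'main.py', '--comm-mode', 'PS']   # placeholder command

with open(CONF_PATH) as f:
    conf = yaml.safe_load(f)


def spawn(role):
    # Every ps-lite process reads the cluster layout from DMLC_* variables;
    # DMLC_ROLE selects scheduler / server / worker behaviour.
    env = dict(os.environ, DMLC_ROLE=role,
               **{k: str(v) for k, v in conf['shared'].items()})
    return subprocess.Popen(ENTRY, env=env)


procs = [spawn('scheduler')] if conf['launch']['scheduler'] else []
procs += [spawn('server') for _ in range(conf['launch']['server'])]
procs += [spawn('worker') for _ in range(conf['launch']['worker'])]
for p in procs:
    p.wait()
```

In the repository, `bin/heturun` forwards to `python/runner.py`, which performs the real launch.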

+202 -0 examples/cnn/main.py

@@ -0,0 +1,202 @@
import hetu as ht
import models
import os
import numpy as np
import argparse
import json
import logging
from time import time
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if device_id == 0:
logger.info(msg)


if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=10, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument('--comm-mode', default=None, help='communication mode')
args = parser.parse_args()

global device_id
device_id = 0
print_rank0("Training {} on HETU".format(args.model))
if args.comm_mode in ('AllReduce', 'Hybrid'):
comm, device_id = ht.mpi_nccl_init()
executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0)
else:
if args.gpu == -1:
executor_ctx = ht.cpu(0)
print_rank0('Use CPU.')
else:
executor_ctx = ht.gpu(args.gpu)
print_rank0('Use GPU %d.' % args.gpu)
if args.comm_mode in ('PS', 'Hybrid'):
settings_file = open(os.path.join(os.path.abspath(
os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu))
settings = json.load(settings_file)
for key in settings:
if type(settings[key]) == str:
os.environ[key] = settings[key]
else:
os.environ[key] = str(settings[key]) # type is str

assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \
'Model not supported!'
model = eval('models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset
assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'

if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = ht.optim.MomentumOptimizer(
learning_rate=args.learning_rate, nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = ht.optim.AdaGradOptimizer(
learning_rate=args.learning_rate, initial_accumulator_value=0.1)
else:
print_rank0('Use Adam Optimizer.')
opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate)

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
# train_set_x: (50000, 784), train_set_y: (50000, 10)
# valid_set_x: (10000, 784), valid_set_y: (10000, 10)
# x_shape = (args.batch_size, 784)
# y_shape = (args.batch_size, 10)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=10)
if args.model == "mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10)
# valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10)
# x_shape = (args.batch_size, 3, 32, 32)
# y_shape = (args.batch_size, 10)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=100)
# train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100)
# valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100)
else:
raise NotImplementedError

# model definition
print_rank0('Building model {}'.format(args.model))
x = ht.dataloader_op([
ht.Dataloader(train_set_x, args.batch_size, 'train'),
ht.Dataloader(valid_set_x, args.batch_size, 'validate'),
])
y_ = ht.dataloader_op([
ht.Dataloader(train_set_y, args.batch_size, 'train'),
ht.Dataloader(valid_set_y, args.batch_size, 'validate'),
])
if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
loss, y = model(x, y_, 100)
else:
loss, y = model(x, y_)

train_op = opt.minimize(loss)

eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]}
executor = ht.Executor(eval_nodes, ctx=executor_ctx,
comm_mode=args.comm_mode)
n_train_batches = executor.get_batch_num('train')
n_valid_batches = executor.get_batch_num('validate')

# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):  # one extra epoch: epoch 0 serves as warm-up and is excluded from the timing total
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
loss_val, predict_y, y_val, _ = executor.run(
'train', eval_node_list=[loss, y, y_, train_op])
# Loss for this minibatch
predict_y = predict_y.asnumpy()
y_val = y_val.asnumpy()
loss_all += loss_val.asnumpy()
batch_num += 1
# Predict accuracy for this minibatch
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)

loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
during_time = end - start
print_rank0("Running time of current epoch = %fs" % (during_time))
if i != 0:
running_time += during_time
if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
loss_val, valid_y_predicted, y_val = executor.run(
'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True)
val_loss_all += loss_val
batch_num += 1
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)

val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))
if args.comm_mode in ('AllReduce', 'Hybrid'):
ht.mpi_nccl_finish(comm)

+ 61
- 0
examples/cnn/models/AlexNet.py View File

@@ -0,0 +1,61 @@
import hetu as ht
from hetu import init


def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False):
weight = init.random_normal(
shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
bn_scale = init.random_normal(
shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale')
bn_bias = init.random_normal(
shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias')
x = ht.conv2d_op(x, weight, stride=1, padding=1)
x = ht.batch_normalization_op(x, bn_scale, bn_bias)
if with_relu:
x = ht.relu_op(x)
if with_pool:
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0)
return x


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def alexnet(x, y_):
'''
AlexNet model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building AlexNet model...')
x = ht.array_reshape_op(x, [-1, 1, 28, 28])
x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1',
with_relu=True, with_pool=True)
x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2',
with_relu=True, with_pool=True)
x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3',
with_relu=True, with_pool=False)
x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4',
with_relu=True, with_pool=False)
x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5',
with_relu=False, with_pool=True)
x = ht.array_reshape_op(x, (-1, 256*3*3))
x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True)
x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 41
- 0
examples/cnn/models/CNN.py View File

@@ -0,0 +1,41 @@
import hetu as ht
from hetu import init


def conv_relu_avg(x, shape):
weight = init.random_normal(shape=shape, stddev=0.1)
x = ht.conv2d_op(x, weight, padding=2, stride=1)
x = ht.relu_op(x)
x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def fc(x, shape):
weight = init.random_normal(shape=shape, stddev=0.1)
bias = init.random_normal(shape=shape[-1:], stddev=0.1)
x = ht.array_reshape_op(x, (-1, shape[0]))
x = ht.matmul_op(x, weight)
y = x + ht.broadcastto_op(bias, x)
return y


def cnn_3_layers(x, y_):
'''
3-layer-CNN model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building 3-layer-CNN model...')
x = ht.array_reshape_op(x, [-1, 1, 28, 28])
x = conv_relu_avg(x, (32, 1, 5, 5))
x = conv_relu_avg(x, (64, 32, 5, 5))
y = fc(x, (7 * 7 * 64, 10))
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 90
- 0
examples/cnn/models/LSTM.py View File

@@ -0,0 +1,90 @@
import hetu as ht
from hetu import init
import numpy as np


def lstm(x, y_):
'''
LSTM model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

forget_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w")
forget_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u")
forget_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b")
input_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w")
input_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u")
input_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b")
output_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w")
output_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u")
output_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b")
tanh_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w")
tanh_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u")
tanh_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b")
out_weights = init.random_normal(
shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight")
out_bias = init.random_normal(
shape=(dimoutput,), stddev=0.1, name="lstm_out_bias")
initial_state = ht.Variable(value=np.zeros((1,)).astype(
np.float32), name='initial_state', trainable=False)
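# the scalar zero above is broadcast to (batch, dimhidden) at the first step below,
# serving as the initial cell and hidden states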

for i in range(nsteps):
cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput))
# forget gate
if i == 0:
temp = ht.matmul_op(cur_x, forget_gate_w)
last_c_state = ht.broadcastto_op(initial_state, temp)
last_h_state = ht.broadcastto_op(initial_state, temp)
cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp
else:
cur_forget = ht.matmul_op(
last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w)
cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget)
cur_forget = ht.sigmoid_op(cur_forget)
# input gate
cur_input = ht.matmul_op(
last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w)
cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input)
cur_input = ht.sigmoid_op(cur_input)
# output gate
cur_output = ht.matmul_op(
last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w)
cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output)
cur_output = ht.sigmoid_op(cur_output)
# tanh
cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \
ht.matmul_op(cur_x, tanh_w)
cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh)
cur_tanh = ht.tanh_op(cur_tanh)

last_c_state = ht.mul_op(last_c_state, cur_forget) + \
ht.mul_op(cur_input, cur_tanh)
last_h_state = ht.tanh_op(last_c_state) * cur_output

x = ht.matmul_op(last_h_state, out_weights)
y = x + ht.broadcastto_op(out_bias, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 46
- 0
examples/cnn/models/LeNet.py View File

@@ -0,0 +1,46 @@
import hetu as ht
from hetu import init


def conv_pool(x, in_channel, out_channel, name):
weight = init.random_normal(
shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight')
x = ht.conv2d_op(x, weight, padding=2, stride=1)
x = ht.relu_op(x)
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def lenet(x, y_):
'''
LeNet model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building LeNet model...')
x = ht.array_reshape_op(x, (-1, 1, 28, 28))
x = conv_pool(x, 1, 6, name='lenet_conv1')
x = conv_pool(x, 6, 16, name='lenet_conv2')
x = ht.array_reshape_op(x, (-1, 7*7*16))
x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True)
x = fc(x, (120, 84), name='lenet_fc2', with_relu=True)
y = fc(x, (84, 10), name='lenet_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 24
- 0
examples/cnn/models/LogReg.py View File

@@ -0,0 +1,24 @@
import hetu as ht
from hetu import init


def logreg(x, y_):
'''
Logistic Regression model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building logistic regression model...")
weight = init.zeros((784, 10), name='logreg_weight')
bias = init.zeros((10,), name='logreg_bias')
x = ht.matmul_op(x, weight)
y = x + ht.broadcastto_op(bias, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 33
- 0
examples/cnn/models/MLP.py View File

@@ -0,0 +1,33 @@
import hetu as ht
from hetu import init


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def mlp(x, y_):
'''
MLP model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building MLP model...")
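# the first fully connected layer assumes flattened CIFAR-10 inputs (3 * 32 * 32 = 3072 features)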
x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True)
x = fc(x, (256, 256), 'mlp_fc2', with_relu=True)
y = fc(x, (256, 10), 'mlp_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 56
- 0
examples/cnn/models/RNN.py View File

@@ -0,0 +1,56 @@
import hetu as ht
from hetu import init
import numpy as np


def rnn(x, y_):
'''
RNN model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building RNN model...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

weight1 = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1')
bias1 = init.random_normal(
shape=(dimhidden, ), stddev=0.1, name='rnn_bias1')
weight2 = init.random_normal(
shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2')
bias2 = init.random_normal(
shape=(dimhidden, ), stddev=0.1, name='rnn_bias2')
weight3 = init.random_normal(
shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3')
bias3 = init.random_normal(
shape=(dimoutput, ), stddev=0.1, name='rnn_bias3')
last_state = ht.Variable(value=np.zeros((1,)).astype(
np.float32), name='initial_state', trainable=False)

for i in range(nsteps):
cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput))
h = ht.matmul_op(cur_x, weight1)
h = h + ht.broadcastto_op(bias1, h)

if i == 0:
last_state = ht.broadcastto_op(last_state, h)
s = ht.concat_op(h, last_state, axis=1)
s = ht.matmul_op(s, weight2)
s = s + ht.broadcastto_op(bias2, s)
last_state = ht.relu_op(s)

final_state = last_state
x = ht.matmul_op(final_state, weight3)
y = x + ht.broadcastto_op(bias3, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 125
- 0
examples/cnn/models/ResNet.py View File

@@ -0,0 +1,125 @@
import hetu as ht
from hetu import init


def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''):
weight = init.random_normal(
shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
x = ht.conv2d_op(x, weight, stride=stride, padding=padding)
return x


def batch_norm_with_relu(x, hidden, name):
scale = init.random_normal(
shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale')
bias = init.random_normal(shape=(1, hidden, 1, 1),
stddev=0.1, name=name+'_bias')
x = ht.batch_normalization_op(x, scale, bias)
x = ht.relu_op(x)
return x


def resnet_block(x, in_channel, num_blocks, is_first=False, name=''):
if is_first:
out_channel = in_channel
identity = x
x = conv2d(x, in_channel, out_channel, stride=1,
padding=1, name=name+'_conv1')
x = batch_norm_with_relu(x, out_channel, name+'_bn1')
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv2')
x = x + identity
else:
out_channel = 2 * in_channel
identity = x
x = batch_norm_with_relu(x, in_channel, name+'_bn0')
x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]])
x = conv2d(x, in_channel, out_channel, stride=2,
padding=0, name=name+'_conv1')
x = batch_norm_with_relu(x, out_channel, name+'_bn1')
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv2')
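# shortcut for the downsampling case: average-pool the identity to halve its spatial
# size, then zero-pad its channel dimension up to out_channel (a parameter-free,
# "option A"-style ResNet shortcut)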
identity = ht.avg_pool2d_op(
identity, kernel_H=2, kernel_W=2, padding=0, stride=2)
identity = ht.pad_op(
identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
x = x + identity

for i in range(1, num_blocks):
identity = x
x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i))
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv%d' % (2 * i + 1))
x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1))
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv%d' % (2 * i + 2))
x = x + identity

return x


def fc(x, shape, name):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
return x


def resnet(x, y_, num_layers=18, num_class=10):
'''
ResNet model, for CIFAR10 dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
num_layers: 18 or 34
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

base_size = 16

x = conv2d(x, 3, base_size, stride=1, padding=1,
name='resnet_initial_conv')
x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')

if num_layers == 18:
print("Building ResNet-18 model...")
x = resnet_block(x, base_size, num_blocks=2,
is_first=True, name='resnet_block1')
x = resnet_block(x, base_size, num_blocks=2,
is_first=False, name='resnet_block2')
x = resnet_block(x, 2 * base_size, num_blocks=2,
is_first=False, name='resnet_block3')
x = resnet_block(x, 4 * base_size, num_blocks=2,
is_first=False, name='resnet_block4')
elif num_layers == 34:
print("Building ResNet-34 model...")
x = resnet_block(x, base_size, num_blocks=3,
is_first=True, name='resnet_block1')
x = resnet_block(x, base_size, num_blocks=4,
is_first=False, name='resnet_block2')
x = resnet_block(x, 2 * base_size, num_blocks=6,
is_first=False, name='resnet_block3')
x = resnet_block(x, 4 * base_size, num_blocks=3,
is_first=False, name='resnet_block4')
else:
assert False, "Number of layers should be 18 or 34!"

x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
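# for 32x32 CIFAR inputs the feature map here is 8 * base_size channels of size 4x4,
# i.e. 128 * base_size values per sample, which the reshape below flattens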
x = ht.array_reshape_op(x, (-1, 128 * base_size))
y = fc(x, (128 * base_size, num_class), name='resnet_final_fc')
# here we don't use cudnn for softmax crossentropy to avoid overflows
loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False)
loss = ht.reduce_mean_op(loss, [0])
return loss, y


def resnet18(x, y_, num_class=10):
return resnet(x, y_, 18, num_class)


def resnet34(x, y_, num_class=10):
return resnet(x, y_, 34, num_class)

+ 100
- 0
examples/cnn/models/VGG.py View File

@@ -0,0 +1,100 @@
import hetu as ht
from hetu import init


def conv_bn_relu(x, in_channel, out_channel, name):
weight = init.random_normal(shape=(out_channel, in_channel, 3, 3),
stddev=0.1, name=name+'_weight')
bn_scale = init.random_normal(shape=(1, out_channel, 1, 1),
stddev=0.1, name=name+'_bn_scale')
bn_bias = init.random_normal(shape=(1, out_channel, 1, 1),
stddev=0.1, name=name+'_bn_bias')

x = ht.conv2d_op(x, weight, padding=1, stride=1)
x = ht.batch_normalization_op(x, bn_scale, bn_bias)
act = ht.relu_op(x)
return act


def vgg_2block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_3block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_4block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_fc(x, in_feat, out_feat, name):
weight = init.random_normal(shape=(in_feat, out_feat),
stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=(out_feat,),
stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
return x


def vgg(x, y_, num_layers, num_class=10):
'''
VGG model, for CIFAR10/CIFAR100 dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
num_layers: 16 or 19
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

if num_layers == 16:
print('Building VGG-16 model...')
x = vgg_2block(x, 3, 64, 'vgg_block1')
x = vgg_2block(x, 64, 128, 'vgg_block2')
x = vgg_3block(x, 128, 256, 'vgg_block3')
x = vgg_3block(x, 256, 512, 'vgg_block4')
x = vgg_3block(x, 512, 512, 'vgg_block5')

elif num_layers == 19:
print('Building VGG-19 model...')
x = vgg_2block(x, 3, 64, 'vgg_block1')
x = vgg_2block(x, 64, 128, 'vgg_block2')
x = vgg_4block(x, 128, 256, 'vgg_block3')
x = vgg_4block(x, 256, 512, 'vgg_block4')
x = vgg_4block(x, 512, 512, 'vgg_block5')

else:
assert False, 'VGG model should have 16 or 19 layers!'

x = ht.array_reshape_op(x, (-1, 512))
x = vgg_fc(x, 512, 4096, 'vgg_fc1')
x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
y = vgg_fc(x, 4096, num_class, 'vgg_fc3')
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])

return loss, y


def vgg16(x, y_, num_class=10):
return vgg(x, y_, 16, num_class)


def vgg19(x, y_, num_class=10):
return vgg(x, y_, 19, num_class)

+ 9
- 0
examples/cnn/models/__init__.py View File

@@ -0,0 +1,9 @@
from .VGG import vgg, vgg16, vgg19
from .LogReg import logreg
from .CNN import cnn_3_layers
from .AlexNet import alexnet
from .LeNet import lenet
from .MLP import mlp
from .RNN import rnn
from .LSTM import lstm
from .ResNet import resnet, resnet18, resnet34

+ 4
- 0
examples/cnn/pytorch_models/__init__.py View File

@@ -0,0 +1,4 @@
from .mlp import mlp
from .resnet import resnet18, resnet34, resnet50
from .vgg import vgg16, vgg19
from .rnn import rnn

+ 20
- 0
examples/cnn/pytorch_models/mlp.py View File

@@ -0,0 +1,20 @@
import torch.nn.functional as F
import torch.nn as nn


class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.fc1 = nn.Linear(3072, 256)
self.fc2 = nn.Linear(256, 256)
self.fc3 = nn.Linear(256, 10)

def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
out = self.fc3(x)
return out


def mlp():
return MLP()

+ 116
- 0
examples/cnn/pytorch_models/resnet.py View File

@@ -0,0 +1,116 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(
in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, in_planes, planes, stride=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, self.expansion *
planes, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(self.expansion*planes)

self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = F.relu(self.bn2(self.conv2(out)))
out = self.bn3(self.conv3(out))
out += self.shortcut(x)
out = F.relu(out)
return out


class ResNet(nn.Module):
def __init__(self, block, num_blocks, num_classes=10):
super(ResNet, self).__init__()
self.in_planes = 64

self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.linear = nn.Linear(512*block.expansion, num_classes)

def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1]*(num_blocks-1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
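# with 32x32 CIFAR inputs the feature map is 4x4 here, so the 4x4 average pool below reduces it to 1x1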
out = F.avg_pool2d(out, 4)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out


def resnet18(num_classes=10):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)


def resnet34(num_classes=10):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)


def resnet50(num_classes=10):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)


def resnet101(num_classes=10):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)


def resnet152(num_classes=10):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)

+ 36
- 0
examples/cnn/pytorch_models/rnn.py View File

@@ -0,0 +1,36 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
def __init__(self, diminput, dimoutput, dimhidden, nsteps):
super(RNN, self).__init__()
self.diminput = diminput
self.dimoutput = dimoutput
self.dimhidden = dimhidden
self.nsteps = nsteps
self.fc1 = nn.Linear(diminput, dimhidden)
self.fc2 = nn.Linear(dimhidden*2, dimhidden)
self.fc3 = nn.Linear(dimhidden, dimoutput)

def forward(self, x):
last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device)
for i in range(self.nsteps):
t = i % self.nsteps
index = torch.Tensor([idx for idx in range(
t*self.diminput, (t+1)*self.diminput)]).long().to(x.device)
cur_x = torch.index_select(x, 1, index)
h = self.fc1(cur_x)
s = torch.cat([h, last_state], axis=1)
s = self.fc2(s)
last_state = F.relu(s)

final_state = last_state
y = self.fc3(final_state)
return y


def rnn(diminput, dimoutput, dimhidden, nsteps):

return RNN(diminput, dimoutput, dimhidden, nsteps)

+ 48
- 0
examples/cnn/pytorch_models/vgg.py View File

@@ -0,0 +1,48 @@
import torch
import torch.nn as nn


cfg = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
def __init__(self, vgg_name, num_class=10):
super(VGG, self).__init__()
self.features = self._make_layers(cfg[vgg_name])
self.fc1 = nn.Linear(512, 4096)
self.fc2 = nn.Linear(4096, 4096)
self.classifier = nn.Linear(4096, num_class)

def forward(self, x):
out = self.features(x)
out = out.view(out.size(0), -1)
out = self.fc2(self.fc1(out))
out = self.classifier(out)
return out

def _make_layers(self, cfg):
layers = []
in_channels = 3
for x in cfg:
if x == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
nn.BatchNorm2d(x),
nn.ReLU(inplace=True)]
in_channels = x
layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
return nn.Sequential(*layers)


def vgg16(num_class=10):
return VGG('VGG16', num_class)


def vgg19(num_class=10):
return VGG('VGG19', num_class)

+ 309
- 0
examples/cnn/run_tf_horovod.py View File

@@ -0,0 +1,309 @@
import os
import numpy as np
import tensorflow as tf
import tf_models
import time
import argparse
from tqdm import tqdm
from sklearn import metrics
import horovod.tensorflow as hvd
import hetu as ht
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if rank % 8 == 0:
logger.info(msg)


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
# horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8
# python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn


# if using a multi-node setup under conda, /etc/bash.bashrc may need to be modified
# we can also use mpirun (default gloo):
# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
'''
def train(model, args):
hvd.init()

def get_current_shard(data):
part_size = data.shape[0] // hvd.size()
start = part_size * hvd.rank()
end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0]
return data[start:end]

batch_size = 128
if args.model == 'tf_resnet34':
train_images, train_labels, test_images,\
test_labels = ht.data.tf_normalize_cifar10()
x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])
else:
datasets = ht.data.mnist()
train_images, train_labels = datasets[0]
test_images, test_labels = datasets[2]
x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])


n_train_batches = train_images.shape[0] // batch_size

loss, y = model(x, y_)
opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)

global_step = tf.train.get_or_create_global_step()
# in DistributedOptimizer, all tensors are reduced on the GPU by default
# pass device_sparse=... / device_dense=... to change the reduction device
# using device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step)

gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank()))
# Horovod broadcasts the initial variables on the GPU by default, which can cause OOM
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

iterations = train_images.shape[0] // batch_size
total_epoch = 10
start_index = 0
total_time = 0
for ep in range(total_epoch + 1):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc = [], []
for it in range(n_train_batches):
x_val = train_images[start_index: start_index + batch_size]
y_val = train_labels[start_index : start_index+batch_size]
start_index += batch_size
if start_index + batch_size > train_images.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val})
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
en_time = time.time()
train_time = en_time - st_time
if ep != 0:
total_time += train_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, train_time)

print("training time:", total_time)



def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True, help="model to be tested")
parser.add_argument("--all", action="store_true", help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
print('Model:', raw_model)
train(model, args)

if __name__ == '__main__':
main()
'''

if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / nesterov / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
args = parser.parse_args()

hvd.init()
global rank
rank = hvd.rank()
assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

if dataset == 'MNIST':
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)

global_step = tf.train.get_or_create_global_step()
# in DistributedOptimizer, all tensors are reduced on the GPU by default
# pass device_sparse=... / device_dense=... to change the reduction device
# using device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(
opt).minimize(loss, global_step=global_step)
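# a minimal sketch of the device options mentioned above; the device_dense /
# device_sparse keyword names come from the Horovod TensorFlow API, and the
# values here are only illustrative:
# train_op = hvd.DistributedOptimizer(
#     opt, device_dense='/gpu:0', device_sparse='/cpu:0'
# ).minimize(loss, global_step=global_step)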

gpu_options = tf.compat.v1.GPUOptions(
allow_growth=True, visible_device_list=str(hvd.local_rank()))
# Horovod broadcasts the initial variables on the GPU by default, which can cause OOM
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(
hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

# sess.run(tf.compat.v1.global_variables_initializer())

# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time.time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time.time()
print_rank0("Running time of current epoch = %fs" % (end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 9
- 0
examples/cnn/scripts/hetu_16gpu.sh View File

@@ -0,0 +1,9 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce


+ 11
- 0
examples/cnn/scripts/hetu_1gpu.sh View File

@@ -0,0 +1,11 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py


# model:
# e.g. bash hetu_1gpu.sh mlp CIFAR10

### validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

+ 10
- 0
examples/cnn/scripts/hetu_2gpu_ps.sh View File

@@ -0,0 +1,10 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py

### validate and timing
python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched &
python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 &
python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 &
wait

+ 8
- 0
examples/cnn/scripts/hetu_8gpu.sh View File

@@ -0,0 +1,8 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..

### validate and timing
#
NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce

+ 11
- 0
examples/cnn/scripts/horovod_16gpu.sh View File

@@ -0,0 +1,11 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_tf_horovod.py

# horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing

horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model

+ 6
- 0
examples/cnn/scripts/horovod_8gpu.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_tf_horovod.py

horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing

+ 18
- 0
examples/cnn/scripts/pytorch_16gpu_0.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=162.105.146.117
MASTER_PORT=6000
NNODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 18
- 0
examples/cnn/scripts/pytorch_16gpu_1.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=162.105.146.117
MASTER_PORT=39575
NNODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 7
- 0
examples/cnn/scripts/pytorch_1gpu.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

## validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

+ 18
- 0
examples/cnn/scripts/pytorch_8gpu.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 15
- 0
examples/cnn/scripts/tf_16gpu_worker0.sh View File

@@ -0,0 +1,15 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate &
wait


+ 14
- 0
examples/cnn/scripts/tf_16gpu_worker1.sh View File

@@ -0,0 +1,14 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate &
wait

+ 10
- 0
examples/cnn/scripts/tf_1gpu.sh View File

@@ -0,0 +1,10 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_main.py

### validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

### run in cpu
# python ${mainpy} --model tf_mlp --gpu -1 --validate --timing

+ 15
- 0
examples/cnn/scripts/tf_8gpu.sh View File

@@ -0,0 +1,15 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate &
wait


+ 23
- 0
examples/cnn/settings/tf_dist_s1_w16.json View File

@@ -0,0 +1,23 @@
{
"worker": [
"162.105.146.117:34569",
"162.105.146.117:34568",
"162.105.146.117:34567",
"162.105.146.117:34566",
"162.105.146.117:34565",
"162.105.146.117:34564",
"162.105.146.117:34563",
"162.105.146.117:34562",
"162.105.146.118:34779",
"162.105.146.118:34778",
"162.105.146.118:34777",
"162.105.146.118:34776",
"162.105.146.118:34775",
"162.105.146.118:34774",
"162.105.146.118:34773",
"162.105.146.118:34772"
],
"ps": [
"162.105.146.117:34575"
]
}

+ 11
- 0
examples/cnn/settings/tf_dist_s1_w4.json View File

@@ -0,0 +1,11 @@
{
"worker": [
"162.105.146.119:34569",
"162.105.146.119:34568",
"162.105.146.119:34567",
"162.105.146.119:34566"
],
"ps": [
"162.105.146.119:34575"
]
}

+ 15
- 0
examples/cnn/settings/tf_dist_s1_w8.json View File

@@ -0,0 +1,15 @@
{
"worker": [
"162.105.146.119:34569",
"162.105.146.119:34568",
"162.105.146.119:34567",
"162.105.146.119:34566",
"162.105.146.119:34565",
"162.105.146.119:34564",
"162.105.146.119:34563",
"162.105.146.119:34562"
],
"ps": [
"162.105.146.119:34575"
]
}

+ 49
- 0
examples/cnn/tf_launch_server.py View File

@@ -0,0 +1,49 @@
import os
import tensorflow as tf
import multiprocessing
import signal
import json
import argparse


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)
os.environ['CUDA_VISIBLE_DEVICES'] = ''


pop_env()


def start_server(cluster, task_id):
server = tf.train.Server(cluster, job_name='ps', task_index=task_id)
server.join()


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path")
parser.add_argument("--id", type=int, required=True)
args = parser.parse_args()
raw_config = args.config
config = json.load(open(raw_config))
cluster = tf.train.ClusterSpec(config)
global proc
proc = multiprocessing.Process(
target=start_server, args=[cluster, args.id, ])
proc.start()
signal.signal(signal.SIGINT, signal_handler)
proc.join()


def signal_handler(signal, frame):
print("SIGINT signal caught, stop Training")
global proc
proc.kill()
exit(0)


if __name__ == '__main__':
main()

+ 234
- 0
examples/cnn/tf_launch_worker.py View File

@@ -0,0 +1,234 @@
import tensorflow as tf
import tf_models
import hetu as ht

import numpy as np
import argparse
import json
from time import time
import os
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if task_id % 8 == 0:
logger.info(msg)


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / nesterov / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument("--rank", type=int, required=True,
help="rank of process")
parser.add_argument(
"--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path")

args = parser.parse_args()
global task_id

task_id = int(args.rank)
print_rank0("task id %d" % (task_id))
raw_config = args.config

if args.gpu == -1:
device = '/job:worker/task:%d/cpu:0' % (task_id)
print_rank0('Use CPU.')
else:
device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu)
print_rank0('Use GPU %d.' % args.gpu)

config = json.load(open(raw_config))
cluster = tf.train.ClusterSpec(config)

assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

with tf.device(
tf.compat.v1.train.replica_device_setter(
worker_device=device,
cluster=cluster)):
# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)

# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

if dataset == 'MNIST':
x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)
train_op = opt.minimize(loss)

server = tf.train.Server(
cluster, job_name="worker", task_index=task_id)

init = tf.compat.v1.global_variables_initializer()
sv = tf.train.Supervisor(
is_chief=(task_id == 0),
init_op=init,
recovery_wait_secs=1)
sess_config = tf.compat.v1.ConfigProto(
allow_soft_placement=True,
log_device_placement=False,
device_filters=["/job:ps",
"/job:worker/task:%d" % task_id])
sess = sv.prepare_or_wait_for_session(
server.target, config=sess_config)

sess.run(init)
# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" %
(end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 194
- 0
examples/cnn/tf_main.py View File

@@ -0,0 +1,194 @@
import tensorflow as tf
import tf_models
import hetu as ht
import numpy as np
import argparse
from time import time
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
logger.info(msg)


if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
args = parser.parse_args()

if args.gpu == -1:
device = '/cpu:0'
print_rank0('Use CPU.')
else:
device = '/gpu:%d' % args.gpu
print_rank0('Use GPU %d.' % args.gpu)

print_rank0("Training {} on TensorFlow".format(args.model))
assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

# model definition
print_rank0('Building model...')
with tf.device(device):
if dataset == 'MNIST':
x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)

train_op = opt.minimize(loss)

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

# training
print_rank0("Start training loop...")
running_time = 0
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" %
(end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 8
- 0
examples/cnn/tf_models/__init__.py View File

@@ -0,0 +1,8 @@
from .tf_LogReg import tf_logreg
from .tf_CNN import tf_cnn_3_layers
from .tf_LeNet import tf_lenet
from .tf_MLP import tf_mlp
from .tf_RNN import tf_rnn
from .tf_LSTM import tf_lstm
from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34
from .tf_VGG import tf_vgg16, tf_vgg19

+ 45
- 0
examples/cnn/tf_models/tf_CNN.py View File

@@ -0,0 +1,45 @@
import numpy as np
import tensorflow as tf


def tf_conv_relu_avg(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
x = tf.nn.relu(x)
x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1],
padding='VALID', strides=[1, 2, 2, 1])
return x


def tf_fc(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.reshape(x, (-1, shape[0]))
y = tf.matmul(x, weight) + bias
return y


def tf_cnn_3_layers(x, y_):
'''
3-layer-CNN model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print('Building 3-layer-CNN model in tensorflow...')
x = tf.reshape(x, [-1, 28, 28, 1])
x = tf_conv_relu_avg(x, (32, 1, 5, 5))
x = tf_conv_relu_avg(x, (64, 32, 5, 5))
x = tf.transpose(x, [0, 3, 1, 2])
y = tf_fc(x, (7 * 7 * 64, 10))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 81
- 0
examples/cnn/tf_models/tf_LSTM.py View File

@@ -0,0 +1,81 @@
import numpy as np
import tensorflow as tf


def tf_lstm(x, y_):
'''
LSTM model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building LSTM model in tensorflow...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

forget_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
forget_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
forget_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
input_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
input_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
input_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
output_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
output_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
output_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
tanh_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
tanh_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
tanh_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
out_weights = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
out_bias = tf.Variable(np.random.normal(
scale=0.1, size=(dimoutput,)).astype(np.float32))
initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32)

last_c_state = initial_state
last_h_state = initial_state

for i in range(nsteps):
cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
# forget gate
cur_forget = tf.matmul(last_h_state, forget_gate_u) + \
tf.matmul(cur_x, forget_gate_w) + forget_gate_b
cur_forget = tf.sigmoid(cur_forget)
# input gate
cur_input = tf.matmul(last_h_state, input_gate_u) + \
tf.matmul(cur_x, input_gate_w) + input_gate_b
cur_input = tf.sigmoid(cur_input)
# output gate
cur_output = tf.matmul(last_h_state, output_gate_u) + \
tf.matmul(cur_x, output_gate_w) + output_gate_b
cur_output = tf.sigmoid(cur_output)
# tanh
cur_tanh = tf.matmul(last_h_state, tanh_u) + \
tf.matmul(cur_x, tanh_w) + tanh_b
cur_tanh = tf.tanh(cur_tanh)

last_c_state = last_c_state * cur_forget + cur_input * cur_tanh
last_h_state = tf.tanh(last_c_state) * cur_output

y = tf.matmul(last_h_state, out_weights) + out_bias
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 49
- 0
examples/cnn/tf_models/tf_LeNet.py View File

@@ -0,0 +1,49 @@
import numpy as np
import tensorflow as tf


def tf_conv_pool(x, in_channel, out_channel):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
x = tf.nn.relu(x)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
padding='VALID', strides=[1, 2, 2, 1])
return x


def tf_fc(x, shape, with_relu=True):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
if with_relu:
x = tf.nn.relu(x)
return x


def tf_lenet(x, y_):
'''
LeNet model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print('Building LeNet model in tensorflow...')
x = tf.reshape(x, [-1, 28, 28, 1])
x = tf_conv_pool(x, 1, 6)
x = tf_conv_pool(x, 6, 16)
x = tf.transpose(x, [0, 3, 1, 2])
x = tf.reshape(x, (-1, 7*7*16))
x = tf_fc(x, (7*7*16, 120), with_relu=True)
x = tf_fc(x, (120, 84), with_relu=True)
y = tf_fc(x, (84, 10), with_relu=False)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 23
- 0
examples/cnn/tf_models/tf_LogReg.py View File

@@ -0,0 +1,23 @@
import numpy as np
import tensorflow as tf


def tf_logreg(x, y_):
'''
Logistic Regression model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Build logistic regression model in tensorflow...")
weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32))
bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32))
y = tf.matmul(x, weight) + bias
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 34
- 0
examples/cnn/tf_models/tf_MLP.py View File

@@ -0,0 +1,34 @@
import numpy as np
import tensorflow as tf


def tf_fc(x, shape, with_relu=True):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
if with_relu:
x = tf.nn.relu(x)
return x


def tf_mlp(x, y_, num_class=10):
'''
MLP model in TensorFlow, for CIFAR dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building MLP model in tensorflow...")
x = tf_fc(x, (3072, 256), with_relu=True)
x = tf_fc(x, (256, 256), with_relu=True)
y = tf_fc(x, (256, num_class), with_relu=False)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 49
- 0
examples/cnn/tf_models/tf_RNN.py View File

@@ -0,0 +1,49 @@
import numpy as np
import tensorflow as tf


def tf_rnn(x, y_):
'''
RNN model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building RNN model in tensorflow...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

weight1 = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
bias1 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, )).astype(np.float32))
weight2 = tf.Variable(np.random.normal(scale=0.1, size=(
dimhidden + dimhidden, dimhidden)).astype(np.float32))
bias2 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, )).astype(np.float32))
weight3 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
bias3 = tf.Variable(np.random.normal(
scale=0.1, size=(dimoutput, )).astype(np.float32))
last_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32)
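# simple unrolled RNN cell: h_t = relu([x_t W1 + b1 ; h_{t-1}] W2 + b2)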

for i in range(nsteps):
cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
h = tf.matmul(cur_x, weight1) + bias1

s = tf.concat([h, last_state], axis=1)
s = tf.matmul(s, weight2) + bias2
last_state = tf.nn.relu(s)

final_state = last_state
y = tf.matmul(final_state, weight3) + bias3
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 113
- 0
examples/cnn/tf_models/tf_ResNet.py View File

@@ -0,0 +1,113 @@
import numpy as np
import tensorflow as tf


def tf_conv2d(x, in_channel, out_channel, stride=1):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME')
return x


def tf_batch_norm_with_relu(x, hidden):
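# note: normalization uses the moments of the current batch; no running statistics are kept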
scale = tf.Variable(np.random.normal(
scale=0.1, size=(hidden,)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(hidden,)).astype(np.float32))
axis = list(range(len(x.shape) - 1))
a_mean, a_var = tf.nn.moments(x, axis)
x = tf.nn.batch_normalization(
x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
x = tf.nn.relu(x)
return x


def tf_resnet_block(x, in_channel, num_blocks, is_first=False):
if is_first:
out_channel = in_channel
identity = x
x = tf_conv2d(x, in_channel, out_channel, stride=1)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = x + identity
else:
out_channel = 2 * in_channel
identity = x
x = tf_batch_norm_with_relu(x, in_channel)
x = tf_conv2d(x, in_channel, out_channel, stride=2)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
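# parameter-free shortcut: spatially downsample the identity and zero-pad its channels
# from in_channel to 2*in_channel so it matches the strided main branch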
identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [
in_channel // 2, in_channel // 2]])
x = x + identity

for i in range(1, num_blocks):
identity = x
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = x + identity

return x


def tf_fc(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
return x


def tf_resnet(x, y_, num_layers, num_class=10):
'''
ResNet model in TensorFlow, for CIFAR10 dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
num_layers: 18 or 34
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''
print("Number of Class: {}".format(num_class))
base_size = 16

x = tf_conv2d(x, 3, base_size, stride=1)
x = tf_batch_norm_with_relu(x, base_size)

if num_layers == 18:
print("Building ResNet-18 model in tensorflow...")
x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True)
x = tf_resnet_block(x, base_size, num_blocks=2)
x = tf_resnet_block(x, 2 * base_size, num_blocks=2)
x = tf_resnet_block(x, 4 * base_size, num_blocks=2)
elif num_layers == 34:
print("Building ResNet-34 model in tensorflow...")
x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True)
x = tf_resnet_block(x, base_size, num_blocks=4)
x = tf_resnet_block(x, 2 * base_size, num_blocks=6)
x = tf_resnet_block(x, 4 * base_size, num_blocks=3)
else:
assert False, "Number of layers should be 18 or 34 !"

x = tf_batch_norm_with_relu(x, 8 * base_size)
x = tf.transpose(x, [0, 3, 1, 2])
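# for 32x32 inputs the feature map is now 4x4 with 8*base_size channels, i.e. 128*base_size values per sample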
x = tf.reshape(x, [-1, 128 * base_size])
y = tf_fc(x, (128 * base_size, num_class))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y


def tf_resnet18(x, y_, num_class=10):
return tf_resnet(x, y_, 18, num_class)


def tf_resnet34(x, y_, num_class=10):
return tf_resnet(x, y_, 34, num_class)

+ 103
- 0
examples/cnn/tf_models/tf_VGG.py View File

@@ -0,0 +1,103 @@
import numpy as np
import tensorflow as tf


def conv_bn_relu(x, in_channel, out_channel):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
scale = tf.Variable(np.random.normal(
scale=0.1, size=(out_channel,)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(out_channel,)).astype(np.float32))
x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME')
axis = list(range(len(x.shape) - 1))
a_mean, a_var = tf.nn.moments(x, axis)
x = tf.nn.batch_normalization(
x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
x = tf.nn.relu(x)
return x


def vgg_2block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def vgg_3block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def vgg_4block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def tf_fc(x, in_feat, out_feat):
weight = tf.Variable(np.random.normal(
scale=0.1, size=(in_feat, out_feat)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(out_feat,)).astype(np.float32))
x = tf.matmul(x, weight) + bias
return x


def tf_vgg(x, y_, num_layers, num_class=10):
'''
VGG model in TensorFlow, for CIFAR10 dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
num_layers: 16 or 19
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''
if num_layers == 16:
print('Building VGG-16 model in tensorflow')
x = vgg_2block(x, 3, 64)
x = vgg_2block(x, 64, 128)
x = vgg_3block(x, 128, 256)
x = vgg_3block(x, 256, 512)
x = vgg_3block(x, 512, 512)

elif num_layers == 19:
print('Building VGG-19 model in tensorflow')
x = vgg_2block(x, 3, 64)
x = vgg_2block(x, 64, 128)
x = vgg_4block(x, 128, 256)
x = vgg_4block(x, 256, 512)
x = vgg_4block(x, 512, 512)
else:
assert False, "Number of layers should be 18 or 34 !"

x = tf.reshape(x, [-1, 512])
x = tf_fc(x, 512, 4096)
x = tf_fc(x, 4096, 4096)
y = tf_fc(x, 4096, num_class)
print("Number of Class: {}".format(num_class))

loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y


def tf_vgg16(x, y_, num_class=10):
return tf_vgg(x, y_, 16, num_class)


def tf_vgg19(x, y_, num_class=10):
return tf_vgg(x, y_, 19, num_class)

+ 213
- 0
examples/cnn/torch_main.py View File

@@ -0,0 +1,213 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from pytorch_models import *
import hetu as ht
import numpy as np
import argparse
from time import time
import os
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
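# only the first rank on each 8-GPU node logs, to avoid duplicated output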
if local_rank % 8 == 0:
logger.info(msg)


def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None):
print_rank0('Epoch: %d' % epoch)
n_train_batches = data.shape[0] // batch_size

net.train()

train_loss = 0
correct = 0
total = 0

for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
inputs = torch.Tensor(data[minibatch_start:minibatch_end])
targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()

inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad()
outputs = net(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()

train_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()

print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1)))
print_rank0("Train accuracy = %f" % (100.*correct/total))


def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None):
net.eval()
n_test_batches = data.shape[0] // batch_size
test_loss = 0
correct = 0
total = 0

with torch.no_grad():
for minibatch_index in range(n_test_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
inputs = torch.Tensor(data[minibatch_start:minibatch_end])
targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()

inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
test_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()

print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1)))
print_rank0("Validation accuracy = %f" % (100.*correct/total))


if __name__ == "__main__":
# argument parser
global local_rank
local_rank = 0
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument('--distributed', action='store_true',
help='whether to use distributed training')
parser.add_argument('--local_rank', type=int, default=-1)
args = parser.parse_args()

if args.distributed == True:
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
rank = int(os.getenv('RANK', '0'))
world_size = int(os.getenv("WORLD_SIZE", '1'))
print("***"*50)
print(init_method)
torch.distributed.init_process_group(backend="nccl",
world_size=world_size,
rank=rank,
init_method=init_method)

if args.gpu == -1:
device = 'cpu'
else:
if args.distributed == True:
local_rank = rank % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
device = torch.device('cuda:%d' % local_rank)
logger.info('Use GPU %d.' % local_rank)
else:
device = torch.device('cuda:%d' % args.gpu)
torch.cuda.set_device(args.gpu)
print_rank0('Use GPU %d.' % args.gpu)

assert args.model in ['mlp', 'resnet18', 'resnet34',
'vgg16', 'vgg19', 'rnn'], 'Model not supported now.'

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
net = eval(args.model)(100)
elif args.model == 'rnn':
net = eval(args.model)(28, 10, 128, 28)
else:
net = eval(args.model)()

net.to(device)
if args.distributed:
net = torch.nn.parallel.DistributedDataParallel(
net, device_ids=[local_rank])

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate,
momentum=0.9, nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = optim.Adagrad(net.parameters(), lr=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = optim.Adam(net.parameters(), lr=args.learning_rate)

criterion = nn.CrossEntropyLoss()

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist(onehot=False)
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=10, onehot=False)
if args.model == "mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=100, onehot=False)

running_time = 0
# training
print_rank0("Start training loop...")
for i in range(args.num_epochs + 1):
if args.timing:
start = time()
train(epoch=i, net=net, data=train_set_x, label=train_set_y,
batch_size=args.batch_size, criterion=criterion, optimizer=opt)
if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" % (end - start))
if i != 0:
running_time += (end - start)
test(epoch=i, net=net, data=valid_set_x, label=valid_set_y,
batch_size=args.batch_size, criterion=criterion)

print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 9
- 0
examples/cnn/worker_conf0.json View File

@@ -0,0 +1,9 @@
{
"DMLC_ROLE":"worker",
"WORKER_ID":"0",
"DMLC_PS_ROOT_URI":"127.0.0.1",
"DMLC_PS_ROOT_PORT":"13030",
"DMLC_NUM_WORKER":"2",
"DMLC_NUM_SERVER":"1",
"DMLC_PS_VAN_TYPE":"p3"
}

+ 9
- 0
examples/cnn/worker_conf1.json View File

@@ -0,0 +1,9 @@
{
"DMLC_ROLE":"worker",
"WORKER_ID":"1",
"DMLC_PS_ROOT_URI":"127.0.0.1",
"DMLC_PS_ROOT_PORT":"13030",
"DMLC_NUM_WORKER":"2",
"DMLC_NUM_SERVER":"1",
"DMLC_PS_VAN_TYPE":"p3"
}

+ 2
- 0
examples/ctr/.gitignore View File

@@ -0,0 +1,2 @@
datasets/
logs/

+ 109
- 0
examples/ctr/README.md View File

@@ -0,0 +1,109 @@
# CTR Examples (with Distributed Settings)
In this directory we provide several models for CTR tasks. We train the Wide & Deep model on the Adult and Criteo datasets, and the DeepFM, DCN, and DC models on the Criteo dataset.

## Structure
```
- ctr
- datasets/ contains sampled criteo data
- models/ ctr models in hetu
- tf_models/ ctr models in tensorflow
- settings/ configurations for distributed training
- tests/ test scripts
- kill.sh script to kill all python processes
- run_hetu.py basic trainer for hetu
- run_tf_local.py local trainer for tensorflow
- run_tf_horovod.py trainer for tensorflow in horovod setting
- run_tf_parallax.py trainer for tensorflow in parallax setting
- tf_launch_server.py launcher for server in tensorflow
- tf_launch_worker.py launcher for worker in tensorflow
```

## Prepare criteo data
* We provide a sampled version of the kaggle-criteo dataset, located in ./datasets/criteo/ . To use the given data, please do not specify the 'all' or 'val' flags when running the test files.
* To download the original kaggle-criteo dataset, please specify a source in models/load_data.py and run ```python models/load_data.py``` to download the whole dataset.


## Flags for test files
Here we explain some of the flags you may use in test files:
* model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult')
* config: to specify the configuration file in settings.
* val: whether to run validation.
* cache: the embedding cache policy to use in PS/Hybrid mode (e.g. lfuopt).
* bsp: whether to use bsp (default asp) in PS/Hybrid mode. (In Hybrid mode, AllReduce can enforce bsp for the dense parameters, so there will be no stragglers.)
* all: whether to use all criteo data.
* bound: per-embedding-entry staleness bound in the cache setting; defaults to 100.


## Usage
If memory available, you can try to run the model locally, by running
```bash
# run locally
bash tests/local_{model}_{dataset}.sh
# run in ps setting (locally)
bash tests/ps_{model}_{dataset}.sh
# run in hybrid setting (locally)
bash tests/hybrid_{model}_{dataset}.sh

# run tensorflow locally
python run_tf_local.py --model {model}_{dataset}
# run tensorflow in horovod
horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset}
# run tensorflow in parallax
python {absolute_path_to}/run_tf_parallax.py
# run tensorflow in ps setting
python tf_launch_server.py --config {config} --id {rank}
python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config}
```


## Configuration
We use a simple yaml file to specify the run configuration.

```yaml
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
launch :
worker : 4
server : 1
scheduler : true
```

The 4 k-v pair in "shared" are used for PS-lite parameter server and will be added into environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" into an available IP address in the cluster.

The following "launch" is only used in PS-mode (ommitted in hybrid mode). This means that the number of worker, server and scheduler launched locally on this machine. In hybrid mode, workers are launched by mpirun. Servers and schedulers will be launched by


## Examples
### Local execution
Run wdl with criteo locally (if the whole dataset has been downloaded, you can use all the data or the validation data):
```bash
python run_hetu.py --model wdl_criteo (--all) (--val)
```

### PS mode execution
Run in PS mode locally; the same setup can also be run on multiple nodes.
```bash
# launch scheduler and server, -n means number of servers, --sched means using scheduler
python -m hetu.launcher {config} -n 1 --sched
# launch workers (or run scheduler and server together if configured in config file)
python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10)
```
You can also specify the cache to be used, as well as the cache bound.


### Hybrid mode execution
You must launch a scheduler and server in one terminal:
```bash
python -m hetu.launcher {config} -n 1 --sched
```
Then launch the workers simultaneously using the mpirun command:
```bash
mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ...
```
Or, in a multi-node setting:
```bash
mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ...
```

+ 3
- 0
examples/ctr/kill.sh View File

@@ -0,0 +1,3 @@
#!/bin/bash
#pkill -f mnist_mlp_ps.py
kill -9 $(pidof python)

+ 5
- 0
examples/ctr/models/__init__.py View File

@@ -0,0 +1,5 @@
from .wdl_adult import wdl_adult
from .dcn_criteo import dcn_criteo
from .dc_criteo import dc_criteo
from .wdl_criteo import wdl_criteo
from .deepfm_criteo import dfm_criteo

+ 63
- 0
examples/ctr/models/dc_criteo.py View File

@@ -0,0 +1,63 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def residual_layer(x0, input_dim, hidden_dim):
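# residual MLP block: y = relu(x0 + relu(x0 W1 + b1) W2 + b2)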

embedding_len = input_dim
weight_1 = init.random_normal(
shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1')
bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1')
weight_2 = init.random_normal(
shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2')
bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2')

x0w = ht.matmul_op(x0, weight_1) # (batch, hidden_dim)
x0w_b = x0w + ht.broadcastto_op(bias_1, x0w)

relu1 = ht.relu_op(x0w_b)
x1w = ht.matmul_op(relu1, weight_2) # (batch, input_dim)
x1w_b = x1w + ht.broadcastto_op(bias_2, x1w)
residual = x1w_b + x0
y = ht.relu_op(residual)
return y


def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3):
for i in range(num_layers):
x0 = residual_layer(x0, input_dim, hidden_dim)
return x0


def dc_criteo(dense_input, sparse_input, y_):

feature_dimension = 33762577
embedding_size = 8
learning_rate = 0.001

Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding")
sparse_input = ht.embedding_lookup_op(Embedding, sparse_input)
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))

# dc_model
x = ht.concat_op(sparse_input, dense_input, axis=1)

input_dim = 26 * 8 + 13
hidden_dim = input_dim
residual_out = build_residual_layers(
x, input_dim, hidden_dim, num_layers=5)

W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4")
y = ht.matmul_op(residual_out, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 68
- 0
examples/ctr/models/dcn_criteo.py View File

@@ -0,0 +1,68 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def cross_layer(x0, x1):
# x0: input embedding feature (batch_size, 26 * embedding_size + 13)
# x1: the output of last layer (batch_size, 26 * embedding_size + 13)

embedding_len = 26 * 128 + 13
weight = init.random_normal(
shape=(embedding_len, 1), stddev=0.01, name='weight')
bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias')
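# cross layer: y = x0 * (x1 w) + b + x1, an explicit feature cross with a residual connection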
x1w = ht.matmul_op(x1, weight) # (batch_size, 1)
y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0))
y = y + x1 + ht.broadcastto_op(bias, y)
return y


def build_cross_layer(x0, num_layers=3):
x1 = x0
for i in range(num_layers):
x1 = cross_layer(x0, x1)
return x1


def dcn_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.003

Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_input = ht.embedding_lookup_op(
Embedding, sparse_input, ctx=ht.cpu(0))
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))
x = ht.concat_op(sparse_input, dense_input, axis=1)
# Cross Network
cross_output = build_cross_layer(x, num_layers=3)

# DNN
flatten = x
W1 = init.random_normal(
[26*embedding_size + 13, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

W4 = init.random_normal(
[256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = ht.concat_op(cross_output, y3, axis=1)
y = ht.matmul_op(y4, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 59
- 0
examples/ctr/models/deepfm_criteo.py View File

@@ -0,0 +1,59 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def dfm_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.01

# FM
Embedding1 = init.random_normal(
[feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0))
FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
sparse_1dim_input = ht.embedding_lookup_op(
Embedding1, sparse_input, ctx=ht.cpu(0))
fm_dense_part = ht.matmul_op(dense_input, FM_W)
fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1)
# fst order output
y1 = fm_dense_part + fm_sparse_part

Embedding2 = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_2dim_input = ht.embedding_lookup_op(
Embedding2, sparse_input, ctx=ht.cpu(0))
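# the second-order FM term below uses the identity 0.5 * ((sum_i e_i)^2 - sum_i e_i^2),
# which gives the sum of all pairwise embedding interactions without an explicit double loop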
sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1)
sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum)

sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input)
sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1)
sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
sparse_2dim_half = sparse_2dim * 0.5
# snd order output
y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

# DNN
flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size))
W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = y1 + y2
y = y4 + y3
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 320
- 0
examples/ctr/models/load_data.py View File

@@ -0,0 +1,320 @@
import os
import numpy as np


###########################################################################
# criteo
###########################################################################

def download_criteo(path):
import tarfile
import pandas as pd
from six.moves import urllib
if not os.path.exists(path):
os.makedirs(path)
assert os.path.isdir(path), 'Please provide a directory path.'
# this source may be invalid, please use other valid sources.
origin = (
'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'
)
print('Downloading data from %s' % origin)
dataset = os.path.join(path, 'criteo.tar.gz')
urllib.request.urlretrieve(origin, dataset)
print("Extracting criteo zip...")
with tarfile.open(dataset) as f:
f.extractall(path=path)
print("Create local files...")

# save csv filed
df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None)
df.columns = ['label'] + ["I" +
str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)]
df.to_csv(os.path.join(path, "train.csv"), index=0)
print('Csv file saved.')

# save numpy arrays
target_path = [os.path.join(path, filename) for filename in [
'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy',
'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']]
dense_feats = [col for col in df.columns if col.startswith('I')]
sparse_feats = [col for col in df.columns if col.startswith('C')]
labels = df['label']
dense_feats = process_dense_feats(df, dense_feats)
sparse_feats = process_sparse_feats(df, sparse_feats)
num_data = dense_feats.shape[0]
perm = np.random.permutation(num_data)
# split data in 2 parts
test_num = num_data // 10
processed_data = [
dense_feats[perm[:-test_num]], # train dense
sparse_feats[perm[:-test_num]], # train sparse
labels[perm[:-test_num]], # train labels
dense_feats[perm[-test_num:]], # validate dense
sparse_feats[perm[-test_num:]], # validate sparse
labels[perm[-test_num:]], # validate labels
]
print('Array shapes:')
for i in range(len(processed_data)):
print(os.path.split(target_path[i])
[-1].split('.')[0], processed_data[i].shape)
np.save(target_path[i], processed_data[i])
print('Numpy arrays saved.')


def process_dense_feats(data, feats):
d = data.copy()
d = d[feats].fillna(0.0)
for f in feats:
d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
return d


def process_sparse_feats(data, feats):
from sklearn.preprocessing import LabelEncoder
# process to embeddings.
d = data.copy()
d = d[feats].fillna("-1")
for f in feats:
label_encoder = LabelEncoder()
d[f] = label_encoder.fit_transform(d[f])
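# offset each column's encoded ids so that all columns share one global embedding index space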
feature_cnt = 0
for f in feats:
d[f] += feature_cnt
feature_cnt += d[f].nunique()
return d


def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True):
import pandas as pd
csv_path = os.path.join(path, "train.csv")
if not os.path.exists(csv_path):
download_criteo(path)
df = pd.read_csv(csv_path, nrows=nrows, header=0)
dense_feats = [col for col in df.columns if col.startswith('I')]
sparse_feats = [col for col in df.columns if col.startswith('C')]
labels = np.array(df['label']).reshape(-1, 1)
dense_feats = np.array(process_dense_feats(df, dense_feats))
sparse_feats = np.array(process_sparse_feats(
df, sparse_feats)).astype(np.int32)
if return_val:
test_num = nrows // 10
train_dense = dense_feats[:-test_num]
train_sparse = sparse_feats[:-test_num]
train_label = labels[:-test_num]
validate_dense = dense_feats[-test_num:]
validate_sparse = sparse_feats[-test_num:]
validate_label = labels[-test_num:]
return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label)
else:
return dense_feats, sparse_feats, labels


def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')):
# all data should be available! no checking.
processed_data = [np.load(os.path.join(path, filename))
for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']]
return tuple(processed_data)


def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True):
file_paths = [os.path.join(path, filename) for filename in [
'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy',
'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']]
if not all([os.path.exists(p) for p in file_paths]):
download_criteo(path)
files = [np.load(filename) for filename in file_paths]
if return_val:
return (files[0], files[1]), (files[2], files[3]), (files[4], files[5])
else:
return files[0], files[2], files[4]


###########################################################################
# adult
###########################################################################

def maybe_download(train_data, test_data):
"""If adult data "train.csv" and "test.csv" are not in your directory,
download them.
"""
import pandas as pd

COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"]

if not os.path.exists(train_data):
print("downloading training data...")
df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
names=COLUMNS, skipinitialspace=True)
else:
df_train = pd.read_csv("train.csv")

if not os.path.exists(test_data):
print("downloading testing data...")
df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
names=COLUMNS, skipinitialspace=True, skiprows=1)
else:
df_test = pd.read_csv("test.csv")

return df_train, df_test


def cross_columns(x_cols):
"""simple helper to build the crossed columns in a pandas dataframe
"""
crossed_columns = dict()
colnames = ['_'.join(x_c) for x_c in x_cols]
for cname, x_c in zip(colnames, x_cols):
crossed_columns[cname] = x_c
return crossed_columns


def val2idx(df, cols):
"""helper to index categorical columns before embeddings.
"""
val_types = dict()
for c in cols:
val_types[c] = df[c].unique()

val_to_idx = dict()
for k, v in val_types.items():
val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

for k, v in val_to_idx.items():
df[k] = df[k].apply(lambda x: v[x])

unique_vals = dict()
for c in cols:
unique_vals[c] = df[c].nunique()

return df, unique_vals


def onehot(x):
from sklearn.preprocessing import OneHotEncoder
return np.array(OneHotEncoder().fit_transform(x).todense())


def wide(df_train, df_test, wide_cols, x_cols, target):
import pandas as pd
print('Processing wide data')
df_train['IS_TRAIN'] = 1
df_test['IS_TRAIN'] = 0
df_wide = pd.concat([df_train, df_test])

crossed_columns_d = cross_columns(x_cols)
categorical_columns = list(
df_wide.select_dtypes(include=['object']).columns)

wide_cols += list(crossed_columns_d.keys())

for k, v in crossed_columns_d.items():
df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)

df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]

dummy_cols = [
c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols])

train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
assert all(train.columns == test.columns)

cols = [c for c in train.columns if c != target]
X_train = train[cols].values
y_train = train[target].values.reshape(-1, 1)
X_test = test[cols].values
y_test = test[target].values.reshape(-1, 1)
return X_train, y_train, X_test, y_test


def load_adult_data(return_val=True):
import pandas as pd
df_train, df_test = maybe_download("train.csv", "test.csv")

df_train['income_label'] = (
df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
df_test['income_label'] = (
df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

age_groups = [0, 25, 65, 90]
age_labels = range(len(age_groups) - 1)
df_train['age_group'] = pd.cut(
df_train['age'], age_groups, labels=age_labels)
df_test['age_group'] = pd.cut(
df_test['age'], age_groups, labels=age_labels)

# columns for wide model
wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
'relationship', 'race', 'gender', 'native_country', 'age_group']
x_cols = (['education', 'occupation'], ['native_country', 'occupation'])

# columns for deep model
embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
'relationship', 'race', 'gender', 'native_country']
cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']

target = 'income_label'

x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
df_train, df_test, wide_cols, x_cols, target)
x_train_wide = np.array(x_train_wide).astype(np.float32)
x_test_wide = np.array(x_test_wide).astype(np.float32)

print('Processing deep data')
df_train['IS_TRAIN'] = 1
df_test['IS_TRAIN'] = 0
df_deep = pd.concat([df_train, df_test])

deep_cols = embedding_cols + cont_cols
df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit on the training rows only, then transform the whole concatenated frame
scaler.fit(df_train[cont_cols])
df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
df_deep, unique_vals = val2idx(df_deep, embedding_cols)

train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)

x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)

x_train_deep = np.transpose(x_train_deep)
x_test_deep = np.transpose(x_test_deep)
y_train = onehot(y_train)
y_test = onehot(y_test)

if return_val:
return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
else:
return x_train_deep, x_train_wide, y_train


###########################################################################
# avazu
###########################################################################

def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')):
import pandas as pd
# please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data
train_file = os.path.join(path, 'train.csv')
# test_file = os.path.join(path, 'test.csv') # useless, no labels

df_train = pd.read_csv(train_file)
sparse_feats = process_sparse_feats(df_train, df_train.columns[2:])
# the embedding num for each feature:
# [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60]
# sum: 9449445

np.save(os.path.join(path, 'sparse.npy'), sparse_feats)


if __name__ == '__main__':
download_criteo(os.path.join(os.path.split(
os.path.abspath(__file__))[0], '../datasets/criteo'))

+ 56
- 0
examples/ctr/models/wdl_adult.py View File

@@ -0,0 +1,56 @@
import hetu as ht
from hetu import init


def wdl_adult(X_deep, X_wide, y_):
lr = 5 / 128
dim_wide = 809
dim_deep = 68

W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
b1 = init.random_normal([50], stddev=0.1, name="b1")
W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
b2 = init.random_normal([20], stddev=0.1, name="b2")

# deep
Embedding = []
X_deep_input = None

for i in range(8):
Embedding_name = "Embedding_deep_" + str(i)
Embedding.append(init.random_normal(
[50, 8], stddev=0.1, name=Embedding_name))
now = ht.embedding_lookup_op(Embedding[i], X_deep[i])
now = ht.array_reshape_op(now, (-1, 8))
if X_deep_input is None:
X_deep_input = now
else:
X_deep_input = ht.concat_op(X_deep_input, now, 1)

for i in range(4):
now = ht.array_reshape_op(X_deep[i + 8], (-1, 1))
X_deep_input = ht.concat_op(X_deep_input, now, 1)

mat1 = ht.matmul_op(X_deep_input, W1)
add1 = mat1 + ht.broadcastto_op(b1, mat1)
relu1 = ht.relu_op(add1)
dropout1 = relu1
mat2 = ht.matmul_op(dropout1, W2)
add2 = mat2 + ht.broadcastto_op(b2, mat2)
relu2 = ht.relu_op(add2)
dropout2 = relu2
dmodel = dropout2

# wide
wmodel = ht.concat_op(X_wide, dmodel, 1)
wmodel = ht.matmul_op(wmodel, W)

prediction = wmodel
loss = ht.softmaxcrossentropy_op(prediction, y_)
loss = ht.reduce_mean_op(loss, [0])

opt = ht.optim.SGDOptimizer(learning_rate=lr)
train_op = opt.minimize(loss)

return loss, prediction, y_, train_op

+ 42
- 0
examples/ctr/models/wdl_criteo.py View File

@@ -0,0 +1,42 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def wdl_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.01
Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_input = ht.embedding_lookup_op(
Embedding, sparse_input, ctx=ht.cpu(0))
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))

# DNN
flatten = dense_input
W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

W4 = init.random_normal(
[256 + 26*embedding_size, 1], stddev=0.01, name="W4")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = ht.concat_op(sparse_input, y3, axis=1)
y = ht.matmul_op(y4, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 230
- 0
examples/ctr/run_hetu.py View File

@@ -0,0 +1,230 @@
import hetu as ht
from hetu.launcher import launch

import os
import os.path as osp
import numpy as np
import yaml
import time
import argparse
from tqdm import tqdm
from sklearn import metrics


def worker(args):
def train(iterations, auc_enabled=True, tqdm_enabled=False):
localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
train_loss = []
train_acc = []
if auc_enabled:
train_auc = []
for it in localiter:
loss_val, predict_y, y_val, _ = executor.run(
'train', convert_to_numpy_ret_vals=True)
if y_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
y_val,
predict_y > 0.5).astype(np.float32)
else:
acc_val = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
if auc_enabled:
train_auc.append(metrics.roc_auc_score(y_val, predict_y))
if auc_enabled:
return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
else:
return np.mean(train_loss), np.mean(train_acc)

def validate(iterations, tqdm_enabled=False):
localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
test_loss = []
test_acc = []
test_auc = []
for it in localiter:
loss_val, test_y_predicted, y_test_val = executor.run(
'validate', convert_to_numpy_ret_vals=True)
if y_test_val.shape[1] == 1: # for criteo case
correct_prediction = np.equal(
y_test_val,
test_y_predicted > 0.5).astype(np.float32)
else:
correct_prediction = np.equal(
np.argmax(y_test_val, 1),
np.argmax(test_y_predicted, 1)).astype(np.float32)
test_loss.append(loss_val[0])
test_acc.append(correct_prediction)
test_auc.append(metrics.roc_auc_score(
y_test_val, test_y_predicted))
return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

def get_current_shard(data):
if args.comm is not None:
part_size = data.shape[0] // nrank
start = part_size * rank
end = start + part_size if rank != nrank - 1 else data.shape[0]
return data[start:end]
else:
return data

batch_size = 128
dataset = args.dataset
model = args.model
device_id = 0

if args.comm == 'PS':
rank = ht.get_worker_communicate().rank()
nrank = int(os.environ['DMLC_NUM_WORKER'])
device_id = rank % 8
elif args.comm == 'Hybrid':
comm = ht.wrapped_mpi_nccl_init()
device_id = comm.dev_id
rank = comm.rank
nrank = int(os.environ['DMLC_NUM_WORKER'])

if dataset == 'criteo':
# define models for criteo
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, labels = process_all_criteo_data(
return_val=args.val)
elif args.val:
from models.load_data import process_head_criteo_data
dense, sparse, labels = process_head_criteo_data(return_val=True)
else:
from models.load_data import process_sampled_criteo_data
dense, sparse, labels = process_sampled_criteo_data()
if isinstance(dense, tuple):
dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [
get_current_shard(dense[1]), batch_size, 'validate']])
sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [
get_current_shard(sparse[1]), batch_size, 'validate']])
y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [
get_current_shard(labels[1]), batch_size, 'validate']])
else:
dense_input = ht.dataloader_op(
[[get_current_shard(dense), batch_size, 'train']])
sparse_input = ht.dataloader_op(
[[get_current_shard(sparse), batch_size, 'train']])
y_ = ht.dataloader_op(
[[get_current_shard(labels), batch_size, 'train']])
elif dataset == 'adult':
from models.load_data import load_adult_data
x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()
dense_input = [
ht.dataloader_op([
[get_current_shard(x_train_deep[:, i]), batch_size, 'train'],
[get_current_shard(x_test_deep[:, i]), batch_size, 'validate'],
]) for i in range(12)
]
sparse_input = ht.dataloader_op([
[get_current_shard(x_train_wide), batch_size, 'train'],
[get_current_shard(x_test_wide), batch_size, 'validate'],
])
y_ = ht.dataloader_op([
[get_current_shard(y_train), batch_size, 'train'],
[get_current_shard(y_test), batch_size, 'validate'],
])
else:
raise NotImplementedError
print("Data loaded.")

loss, prediction, y_, train_op = model(dense_input, sparse_input, y_)

eval_nodes = {'train': [loss, prediction, y_, train_op]}
if args.val:
print('Validation enabled...')
eval_nodes['validate'] = [loss, prediction, y_]
executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs')
executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id),
comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path)

if args.all and dataset == 'criteo':
print('Processing all data...')
file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[
args.comm], args.raw_model)
file_path += '%d.log' % rank if args.comm else '.log'
file_path = osp.join(osp.dirname(
osp.abspath(__file__)), 'logs', file_path)
log_file = open(file_path, 'w')
total_epoch = args.nepoch if args.nepoch > 0 else 11
for ep in range(total_epoch):
print("ep: %d" % ep)
ep_st = time.time()
train_loss, train_acc, train_auc = train(executor.get_batch_num(
'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True)
ep_en = time.time()
if args.val:
val_loss, val_acc, val_auc = validate(
executor.get_batch_num('validate'))
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
% (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st)
else:
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (train_loss, train_acc, train_auc, ep_en - ep_st)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()
else:
total_epoch = args.nepoch if args.nepoch > 0 else 50
for ep in range(total_epoch):
if ep == 5:
start = time.time()
print("epoch %d" % ep)
ep_st = time.time()
train_loss, train_acc = train(
executor.get_batch_num('train'), auc_enabled=False)
ep_en = time.time()
if args.val:
val_loss, val_acc, val_auc = validate(
executor.get_batch_num('validate'))
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f"
% (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc))
else:
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (train_loss, train_acc, ep_en - ep_st))
print('all time:', time.time() - start)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--val", action="store_true",
help="whether to use validation")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
parser.add_argument("--comm", default=None,
help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
parser.add_argument("--bsp", action="store_true",
help="whether to use bsp instead of asp")
parser.add_argument("--cache", default=None, help="cache policy")
parser.add_argument("--bound", default=100, help="cache bound")
parser.add_argument("--config", type=str, default=osp.join(osp.dirname(
osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps")
parser.add_argument("--nepoch", type=int, default=-1,
help="num of epochs, each train 1/10 data")
args = parser.parse_args()
import models
print('Model:', args.model)
model = eval('models.' + args.model)
args.dataset = args.model.split('_')[-1]
args.raw_model = args.model
args.model = model
if args.comm is None:
worker(args)
elif args.comm == 'Hybrid':
settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader)
value = settings['shared']
os.environ['DMLC_ROLE'] = 'worker'
for k, v in value.items():
os.environ[k] = str(v)
worker(args)
elif args.comm == 'PS':
launch(worker, args)
else:
raise NotImplementedError

+ 174
- 0
examples/ctr/run_tf_horovod.py View File

@@ -0,0 +1,174 @@
import os
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics
import horovod.tensorflow as hvd


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
# if using a multi-node setting with conda, /etc/bash.bashrc needs to be modified
# we can also use mpirun (gloo by default):
# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model


def train_criteo(model, args):
hvd.init()

def get_current_shard(data):
part_size = data.shape[0] // hvd.size()
start = part_size * hvd.rank()
end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0]
return data[start:end]

if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature = get_current_shard(dense[0])
sparse_feature = get_current_shard(sparse[0])
labels = get_current_shard(all_labels[0])
val_dense = get_current_shard(dense[1])
val_sparse = get_current_shard(sparse[1])
val_labels = get_current_shard(all_labels[1])
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()
dense_feature = get_current_shard(dense_feature)
sparse_feature = get_current_shard(sparse_feature)
labels = get_current_shard(labels)

batch_size = 128
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])

loss, y, opt = model(dense_input, sparse_input, y_)
global_step = tf.compat.v1.train.get_or_create_global_step()
# in DistributedOptimizer all tensors are reduced on GPU by default;
# this can be changed with the device_sparse=... / device_dense=... arguments,
# but device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(
opt).minimize(loss, global_step=global_step)
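# a minimal sketch of overriding the reduction device, assuming the standard
# device_sparse keyword of hvd.DistributedOptimizer:
#   train_op = hvd.DistributedOptimizer(
#       opt, device_sparse='/cpu:0').minimize(loss, global_step=global_step)
# (as noted above, CPU reduction of sparse tensors trades speed for GPU memory)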

gpu_options = tf.compat.v1.GPUOptions(
allow_growth=True, visible_device_list=str(hvd.local_rank()))
# horovod broadcasts initial variables from GPU by default, which can cause OOM, so broadcast from CPU instead
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(
hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = './logs/tf_hvd_%s_%d.log' % (
args.model, hvd.local_rank())
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 400
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iterations = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iterations):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)
train_criteo(model, args)


if __name__ == '__main__':
main()

+ 202
- 0
examples/ctr/run_tf_local.py View File

@@ -0,0 +1,202 @@
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics


def train_criteo(model, args):
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature, val_dense = dense
sparse_feature, val_sparse = sparse
labels, val_labels = all_labels
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()

batch_size = 128
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])

loss, y, opt = model(dense_input, sparse_input, y_)
train_op = opt.minimize(loss)

init = tf.compat.v1.global_variables_initializer()
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
sess = tf.compat.v1.Session(
config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
sess.run(init)

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = './logs/tf_local_%s.log' % (args.model)
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 11
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iteration = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iteration):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def train_adult(model):
batch_size = 128
total_epoch = 50
dim_wide = 809

X_deep = []
for i in range(8):
X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1]))
for i in range(4):
X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1]))
X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2])
loss, y, train_op = model(X_deep, X_wide, y_)

init = tf.compat.v1.global_variables_initializer()

gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

sess.run(init)

from models.load_data import load_adult_data
x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False)

iterations = x_train_deep.shape[0] // batch_size
for ep in range(total_epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
pre_index = 0

for it in range(iterations):
batch_x_deep = x_train_deep[pre_index:pre_index + batch_size]
batch_x_wide = x_train_wide[pre_index:pre_index + batch_size]
batch_y = y_train[pre_index:pre_index + batch_size]
pre_index += batch_size

my_feed_dict = dict()
for i in range(12):
my_feed_dict[X_deep[i]] = np.array(
batch_x_deep[:, i]).reshape(-1, 1)

my_feed_dict[X_wide] = np.array(batch_x_wide)
my_feed_dict[y_] = batch_y
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
acc_val = np.equal(
np.argmax(loss_val[1], 1),
np.argmax(loss_val[2], 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)

if dataset == 'criteo':
train_criteo(model, args)
elif dataset == 'adult':
train_adult(model)
else:
raise NotImplementedError


if __name__ == '__main__':
main()

+ 211
- 0
examples/ctr/run_tf_parallax.py View File

@@ -0,0 +1,211 @@
import os
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics

from autodist import AutoDist
from autodist.resource_spec import ResourceSpec
from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax
from autodist.strategy.base import Strategy
from autodist.kernel.common.utils import get_op_name
from tensorflow.python.framework import ops


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# Please DO NOT modify /etc/bash.bashrc to activate the conda environment.
# Use python_venv in the spec yml file instead.
# Use the absolute path of the python file.
# Here we use the tf native partitioner instead of autodist's PartitionedPS.
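# a minimal sketch of how the native partitioner is typically attached to an
# embedding variable (tf.compat.v1 API; num_ps_shards, num_features and embed_dim
# are illustrative names, not taken from this file):
#   partitioner = tf.compat.v1.fixed_size_partitioner(num_ps_shards)
#   with tf.compat.v1.variable_scope('embedding', partitioner=partitioner):
#       embed_table = tf.compat.v1.get_variable(
#           'table', shape=[num_features, embed_dim], dtype=tf.float32)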


class Parallaxx(PSLoadBalancing, AllReduce):
"""
Modified version of the original Parallax strategy that removes variable replicas on CPUs.
"""

def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0):
PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
AllReduce.__init__(self, chunk_size)

# pylint: disable=attribute-defined-outside-init
def build(self, graph_item, resource_spec):
"""Generate the strategy."""
expr = Strategy()

# For each variable, generate variable synchronizer config
expr.graph_config.replicas.extend(
[k for k, v in resource_spec.gpu_devices])
reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
self.loads = {ps: 0.0 for ps in reduction_device_names}

# Generate node config
node_config = []
for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
var_op_name = get_op_name(var.name)
grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
if isinstance(grad, ops.Tensor): # this is a dense variable
group_id = idx // self.chunk_size
config = self._gen_all_reduce_node_config(
var.name, group=group_id)
else: # sparse updates
# For Parallax Strategy, all PS vars are sparse so we don't use a proxy.
# Sparse variables are likely larger, so keeping copies would be costlier,
# and usually each device only requires a small part of the overall variable.
config = self._gen_ps_node_config(
var,
# For the Parallax strategy, all PS vars are sparse and need no proxy.
False,
self._sync,
self._staleness
)
node_config.append(config)
expr.node_config.extend(node_config)

return expr


def train_criteo(model, args):
resource_spec_file = os.path.join(os.path.dirname(
__file__), 'settings', 'plx_local_spec.yml')
autodist = AutoDist(resource_spec_file, Parallaxx())
respec = ResourceSpec(resource_spec_file)
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature, val_dense = dense
sparse_feature, val_sparse = sparse
labels, val_labels = all_labels
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()

# autodist will split the fed data across replicas
batch_size = 128
with tf.Graph().as_default() as g, autodist.scope():
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
embed_partitioner = tf.compat.v1.fixed_size_partitioner(
len(respec.nodes), 0) if len(respec.nodes) > 1 else None
loss, y, opt = model(dense_input, sparse_input,
y_, embed_partitioner, False)
train_op = opt.minimize(loss)

sess = autodist.create_distributed_session()

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = os.path.join(os.path.split(os.path.abspath(__file__))[
0], 'logs', 'tf_plx_%s.log' % (args.model))
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 11
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run(
[loss, y, y_, train_op], feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iteration = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iteration):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run(
[loss, y, y_, train_op], feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)

if dataset == 'criteo':
train_criteo(model, args)
else:
raise NotImplementedError


if __name__ == '__main__':
main()

+ 10
- 0
examples/ctr/settings/local_s1.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 0
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w2.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 2
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 2
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w4.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 4
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w8.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 8
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 8
server : 1
scheduler : true

+ 6
- 0
examples/ctr/settings/local_w4.yml View File

@@ -0,0 +1,6 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3

+ 4
- 0
examples/ctr/settings/plx_local_spec.yml View File

@@ -0,0 +1,4 @@
nodes:
- address: localhost
cpus: [0]
gpus: [0,1,2,3,4,5,6,7]

+ 9
- 0
examples/ctr/settings/tf_local_s1_w2.json View File

@@ -0,0 +1,9 @@
{
"worker": [
"127.0.0.1:12349",
"127.0.0.1:12348"
],
"ps": [
"127.0.0.1:12345"
]
}

+ 11
- 0
examples/ctr/settings/tf_local_s1_w4.json View File

@@ -0,0 +1,11 @@
{
"worker": [
"127.0.0.1:23459",
"127.0.0.1:23458",
"127.0.0.1:23457",
"127.0.0.1:23456"
],
"ps": [
"127.0.0.1:23455"
]
}

+ 15
- 0
examples/ctr/settings/tf_local_s1_w8.json View File

@@ -0,0 +1,15 @@
{
"worker": [
"127.0.0.1:34569",
"127.0.0.1:34568",
"127.0.0.1:34567",
"127.0.0.1:34566",
"127.0.0.1:34565",
"127.0.0.1:34564",
"127.0.0.1:34563",
"127.0.0.1:34562"
],
"ps": [
"127.0.0.1:34575"
]
}

+ 7
- 0
examples/ctr/tests/hybrid_dcn_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_dfm_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_wdl_adult.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_wdl_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 6
- 0
examples/ctr/tests/local_dcn_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dcn_criteo --val

+ 6
- 0
examples/ctr/tests/local_dfm_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dfm_criteo --val

+ 6
- 0
examples/ctr/tests/local_wdl_adult.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_adult --val

+ 6
- 0
examples/ctr/tests/local_wdl_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_criteo --val

+ 6
- 0
examples/ctr/tests/ps_dcn_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_dfm_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_wdl_adult.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_wdl_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

Some files were not shown because too many files changed in this diff
