diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..1fccce5
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "third_party/GraphMix"]
+ path = third_party/GraphMix
+ url = https://github.com/nox-410/GraphMix.git
+[submodule "third_party/HetuML"]
+ path = third_party/HetuML
+ url = https://github.com/ccchengff/HetuML.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..fa225f1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,60 @@
+cmake_minimum_required(VERSION 3.18)
+
+project(Hetu CXX)
+
+include(cmake/config.cmake)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_CXX_FLAGS "-O3 -Wall")
+
+# openmp
+find_package(OpenMP REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
+# compile flag: quote the expansion so an empty or undefined HETU_VERSION
+# falls through to the else() branch instead of being an if() syntax error
+if("${HETU_VERSION}" STREQUAL "all")
+  set(HETU_COMPILE_GPU ON)
+  set(HETU_COMPILE_MKL ON)
+elseif("${HETU_VERSION}" STREQUAL "gpu")
+  set(HETU_COMPILE_GPU ON)
+  set(HETU_COMPILE_MKL OFF)
+elseif("${HETU_VERSION}" STREQUAL "mkl")
+  set(HETU_COMPILE_GPU OFF)
+  set(HETU_COMPILE_MKL ON)
+else()
+  message(FATAL_ERROR "unknown hetu version: '${HETU_VERSION}' (expected all/gpu/mkl)")
+endif()
+message(STATUS "HETU version: ${HETU_VERSION}")
+
+# cuda
+if(HETU_COMPILE_GPU)
+  set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc)
+  # Locate the toolkit first, then validate its version via
+  # CUDAToolkit_VERSION; reading ${CUDAToolkit_ROOT}/version.txt directly
+  # breaks on CUDA 11+, where that file was removed.
+  find_package(CUDAToolkit REQUIRED)
+  if(CUDAToolkit_VERSION VERSION_LESS "10.1")
+    message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDAToolkit_VERSION}")
+  endif()
+  enable_language(CUDA)
+endif()
+
+include(FetchContent) # download third_party
+
+add_subdirectory(${CMAKE_SOURCE_DIR}/src)
+
+# plain if(VAR) is safe for undefined/OFF flags; if(${VAR}) errors when unset
+if(HETU_PS)
+  add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite)
+endif()
+
+if(HETU_GEOMETRIC)
+  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix)
+endif()
+if(HETU_ML)
+  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML)
+endif()
+enable_testing()
diff --git a/COMMITTERS.md b/COMMITTERS.md
new file mode 100644
index 0000000..3406485
--- /dev/null
+++ b/COMMITTERS.md
@@ -0,0 +1,33 @@
+## Committer
+
+Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer.
+
+One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers.
+
+Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md).
+
+## Committer Members
+### Current Committer
+- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](mailto:swordonline@foxmail.com)>
+- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](mailto:ccchengff@gmail.com)>
+- [codecaution](https://github.com/codecaution) - **Xiaonan Nie**
+- [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang**
+- [nox-410](https://github.com/nox-410) - **Yining Shi**
+- [initzhang](https://github.com/initzhang) - **Xin Zhang**
+- [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang**
+- [AFDWang](https://github.com/AFDWang) - **Yujie Wang**
+- [sj1104](https://github.com/sj1104) - **Jia Shen**
+- [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou**
+- [zmxdream](https://github.com/zmxdream) - **Minxu Zhang**
+
+We would like to sincerely thank the following community members for their contributions to Hetu.
+
+- [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)**
+- [lbluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
+- [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)**
+- [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)**
+- [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)**
+- [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao**
+- [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue**
+- **Lizi Su**
+- **Dong Li**
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..239c267
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,53 @@
+# Contributing to Hetu
+Welcome to [report Issues](https://github.com/PKU-DAIR/Hetu/issues) or [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It's recommended to read the following Contributing Guide first before contributing.
+
+
+## Issues
+We use Github Issues to track public bugs and feature requests.
+
+### Search Known Issues First
+Please search the existing issues to see if any similar issue or feature request has already been filed. You should make sure your issue isn't redundant.
+
+### Reporting New Issues
+If you open an issue, the more information the better: for example, a detailed description, a screenshot or video of your problem, or logs and code blocks for your crash.
+
+## Pull Requests
+We strongly welcome your pull request to make Hetu better.
+
+### Branch Management
+There are two main kinds of branches here:
+
+1. `main` branch.
+
+ (1). It is the latest (pre-)release branch. We use `main` for tags, with version number `1.0.0`, `1.1.0`, `1.2.0`...
+
+ (2). **Don't submit any PR on `main` branch.**
+
+2. `specific version` branches.
+
+    (1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable developing branch. After full testing, a `specific version` branch will be merged to the `main` branch for the next release.
+
+ (2). **You are recommended to submit bugfix or feature PR on `specific version` branch.**
+
+
+Normal bugfix or feature request should be submitted to `specific version` branch. After full testing, we will merge them to `main` branch for the next release.
+
+
+### Make Pull Requests
+The code team will monitor all pull requests, and we run code checks and tests on them. After all tests pass, we will accept the PR. But it won't be merged to the `main` branch at once; there may be some delay.
+
+Before submitting a pull request, please make sure the followings are done:
+
+1. Fork the repo and create your branch from `main` or `specific version`.
+2. Update code or documentation if you have changed APIs.
+3. Add the copyright notice to the top of any new files you've added.
+4. Check your code lints and checkstyles.
+5. Test and test again your code.
+6. Now, you can submit your pull request on `specific version` branch.
+
+## Code Style Guide
+Use [Code Style](./.clang-format) for Python and C++.
+
+## License
+By contributing to Hetu, you agree that your contributions will be licensed
+under [License](LICENSE)
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index d63dfb7..7849ac5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,124 +1,201 @@
-木兰宽松许可证, 第2版
-
-2020年1月 http://license.coscl.org.cn/MulanPSL2
-
-您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束:
-
-0. 定义
-
-“软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。
-
-“贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。
-
-“贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。
-
-“法人实体” 是指提交贡献的机构及其“关联实体”。
-
-“关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。
-
-1. 授予版权许可
-
-每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。
-
-2. 授予专利许可
-
-每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。
-
-3. 无商标许可
-
-“本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。
-
-4. 分发限制
-
-您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。
-
-5. 免责声明与责任限制
-
-“软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。
-
-6. 语言
-
-“本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。
-
-条款结束
-
-如何将木兰宽松许可证,第2版,应用到您的软件
-
-如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步:
-
-1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字;
-
-2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中;
-
-3, 请将如下声明文本放入每个源文件的头部注释中。
-
-Copyright (c) [Year] [name of copyright holder]
-[Software Name] is licensed under Mulan PSL v2.
-You can use this software according to the terms and conditions of the Mulan PSL v2.
-You may obtain a copy of Mulan PSL v2 at:
- http://license.coscl.org.cn/MulanPSL2
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-See the Mulan PSL v2 for more details.
-Mulan Permissive Software License,Version 2
-Mulan Permissive Software License,Version 2 (Mulan PSL v2)
-
-January 2020 http://license.coscl.org.cn/MulanPSL2
-
-Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions:
-
-0. Definition
-
-Software means the program and related documents which are licensed under this License and comprise all Contribution(s).
-
-Contribution means the copyrightable work licensed by a particular Contributor under this License.
-
-Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License.
-
-Legal Entity means the entity making a Contribution and all its Affiliates.
-
-Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity.
-
-1. Grant of Copyright License
-
-Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not.
-
-2. Grant of Patent License
-
-Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken.
-
-3. No Trademark License
-
-No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4.
-
-4. Distribution Restriction
-
-You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software.
-
-5. Disclaimer of Warranty and Limitation of Liability
-
-THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-6. Language
-
-THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL.
-
-END OF THE TERMS AND CONDITIONS
-
-How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software
-
-To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps:
-
-Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner;
-Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package;
-Attach the statement to the appropriate annotated syntax at the beginning of each source file.
-Copyright (c) [Year] [name of copyright holder]
-[Software Name] is licensed under Mulan PSL v2.
-You can use this software according to the terms and conditions of the Mulan PSL v2.
-You may obtain a copy of Mulan PSL v2 at:
- http://license.coscl.org.cn/MulanPSL2
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-See the Mulan PSL v2 for more details.
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [2021] [Peking University]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/README.md b/README.md
index 6ec32e1..1accd40 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,177 @@
-#### 从命令行创建一个新的仓库
+
+

+
+# HETU
+
+
+
+[Documentation](https://hetu-doc.readthedocs.io) |
+[Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html)
+
+Hetu is a high-performance distributed deep learning system targeting trillions of parameters DL model training, developed by DAIR Lab at Peking University. It takes account of both high availability in industry and innovation in academia, which has a number of advanced characteristics:
+
+- Applicability. DL model definition with standard dataflow graph; many basic CPU and GPU operators; efficient implementation of plenty of DL models and at least 10 popular ML algorithms.
+
+- Efficiency. Achieve at least 30% speedup compared to TensorFlow on DNN, CNN, RNN benchmarks.
+
+- Flexibility. Supporting various parallel training protocols and distributed communication architectures, such as Data/Model/Pipeline parallel; Parameter server & AllReduce.
+
+- Scalability. Deployment on more than 100 computation nodes; Training giant models with trillions of model parameters, e.g., Criteo Kaggle, Open Graph Benchmark
+
+- Agility. Automatically ML pipeline: feature engineering, model selection, hyperparameter search.
+
+We welcome everyone interested in machine learning or graph computing to contribute codes, create issues or pull requests. Please refer to [Contribution Guide](CONTRIBUTING.md) for more details.
+
+## Installation
+1. Clone the repository.
+
+2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used:
+```conda env create -f environment.yml``` .
+Please prepare Cuda toolkit and CuDNN in advance.
+
+3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
```bash
-touch README.md
-git init
-git add README.md
-git commit -m "first commit"
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+# modify paths and configurations in cmake/config.cmake
+
+# generate Makefile
+mkdir build && cd build && cmake ..
+# compile
+# make all
+make -j 8
+# make hetu, version is specified in cmake/config.cmake
+make hetu -j 8
+# make allreduce module
+make allreduce -j 8
+# make ps module
+make ps -j 8
+# make geometric module
+make geometric -j 8
+# make hetu-cache module
+make hetu_cache -j 8
```
-#### 从命令行推送已经创建的仓库
+
+4. Prepare environment for running. Edit the hetu.exp file and set the environment path for python and the path for executable mpirun if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp` .
+
+
+
+## Usage
+
+Train logistic regression on gpu:
```bash
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST
+```
+
+Train a 3-layer mlp on gpu:
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer cnn with gpu:
+
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST
```
+Train a 3-layer mlp with allreduce on 8 gpus (use mpirun):
+```bash
+bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer mlp with PS on 1 server and 2 workers:
+```bash
+# in the script we launch the scheduler and server, and two workers
+bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10
+```
+
+
+## More Examples
+Please refer to examples directory, which contains CNN, NLP, CTR, GNN training scripts. For distributed training, please refer to CTR and GNN tasks.
+
+## Community
+* Email: xupeng.miao@pku.edu.cn
+* Slack: coming soon
+* Hetu homepage: https://hetu-doc.readthedocs.io
+* [Committers & Contributors](COMMITTERS.md)
+* [Contributing to Hetu](CONTRIBUTING.md)
+* [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html)
+
+## Enterprise Users
+
+If you are enterprise users and find Hetu is useful in your work, please let us know, and we are glad to add your company logo here.
+
+
+
+
+
+## License
+
+The entire codebase is under [license](LICENSE)
+
+## Papers
+ 1. Xupeng Miao, Linxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs.](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021
+ 2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773) SIGMOD 2021
+ 3. coming soon
+
+## Acknowledgements
+
+We learned and borrowed insights from a few open source projects including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel).
+
+## Appendix
+The prerequisites for the different modules in Hetu are listed as follows:
+ ```
+ "*" means you should prepare by yourself, while others support auto-download
+
+ Hetu: OpenMP(*), CMake(*)
+ Hetu (version mkl): MKL 1.6.1
+ Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*)
+ Hetu (version all): both
+
+ Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs GPU version
+
+ Hetu-PS: Protobuf(*), ZeroMQ 4.3.2
+
+ Hetu-Geometric: Pybind11(*), Metis(*)
+
+ Hetu-Cache: Pybind11(*), this module needs PS module
+
+ ##################################################################
+ Tips for preparing the prerequisites
+
+ Preparing CUDA, CUDNN, NCCL(NCCl is already in conda environment):
+ 1. download from https://developer.nvidia.com
+ 2. install
+ 3. modify paths in cmake/config.cmake if necessary
+
+ Preparing OpenMP:
+  You just need to ensure that your compiler supports OpenMP.
+
+ Preparing CMake, Protobuf, Pybind11, Metis:
+ Install by anaconda:
+ conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis
+
+ Preparing OpenMPI (not necessary):
+ install by anaconda: `conda install -c conda-forge openmpi=4.0.3`
+ or
+ 1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
+ 2. build openmpi by `./configure /path/to/build && make -j8 && make install`
+ 3. modify MPI_HOME to /path/to/build in cmake/config.cmake
+
+ Preparing MKL (not necessary):
+ install by anaconda: `conda install -c conda-forge onednn`
+ or
+ 1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz
+ 2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
+ 3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake
+
+ Preparing ZeroMQ (not necessary):
+ install by anaconda: `conda install -c anaconda zeromq=4.3.2`
+ or
+ 1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip
+  2. build zeromq by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
+ 3. modify ZMQ_ROOT to /path/to/build in cmake/config.cmake
+ ```
diff --git a/bin/heturun b/bin/heturun
new file mode 100755
index 0000000..5bf3693
--- /dev/null
+++ b/bin/heturun
@@ -0,0 +1,2 @@
+#!/bin/bash
+python $(cd $(dirname $0); pwd)/../python/runner.py $@
diff --git a/cmake/Modules/FindCUDNN.cmake b/cmake/Modules/FindCUDNN.cmake
new file mode 100644
index 0000000..5cfbb0d
--- /dev/null
+++ b/cmake/Modules/FindCUDNN.cmake
@@ -0,0 +1,75 @@
+# Find the CUDNN libraries
+#
+# The following variables are optionally searched for defaults
+# CUDNN_ROOT: Base directory where CUDNN is found
+# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for
+# CUDNN_LIBRARY: Directory where CUDNN library is searched for
+# CUDNN_STATIC: Are we looking for a static library? (default: no)
+#
+# The following are set after configuration is done:
+# CUDNN_FOUND
+# CUDNN_INCLUDE_PATH
+# CUDNN_LIBRARY_PATH
+#
+
+set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN")
+if (DEFINED ENV{CUDNN_ROOT_DIR})
+ message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.")
+endif()
+list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
+
+# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
+list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT})
+
+set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files")
+
+find_path(CUDNN_INCLUDE_PATH cudnn.h
+ HINTS ${CUDNN_INCLUDE_DIR}
+ PATH_SUFFIXES cuda/include cuda include
+ REQUIRED)
+
+option(CUDNN_STATIC "Look for static CUDNN" OFF)
+if (CUDNN_STATIC)
+ set(CUDNN_LIBNAME "libcudnn_static.a")
+else()
+ set(CUDNN_LIBNAME "cudnn")
+endif()
+
+set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)")
+if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC)
+ message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.")
+endif()
+
+find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
+ PATHS ${CUDNN_LIBRARY}
+ PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64
+ REQUIRED)
+
+set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc")
+file(WRITE ${file} "
+ #include <iostream>
+ #include \"${CUDNN_INCLUDE_PATH}/cudnn.h\"
+ int main()
+ {
+ std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' << CUDNN_PATCHLEVEL;
+ int x = cudnnGetVersion();
+ return x == CUDNN_VERSION;
+ }
+")
+try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
+ RUN_OUTPUT_VARIABLE CUDNN_VERSION
+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
+ LINK_LIBRARIES ${CUDNN_LIBRARY_PATH})
+if (NOT CUDNN_VERSION_MATCHED)
+ message(FATAL_ERROR "Found CUDNN header version and library version do not match! \
+ (include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.")
+endif()
+message(STATUS "CUDNN version: ${CUDNN_VERSION}")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+ CUDNN
+ REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH
+ VERSION_VAR CUDNN_VERSION)
+
+mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
diff --git a/cmake/Modules/FindMETIS.cmake b/cmake/Modules/FindMETIS.cmake
new file mode 100644
index 0000000..eaf818b
--- /dev/null
+++ b/cmake/Modules/FindMETIS.cmake
@@ -0,0 +1,70 @@
+# Accepts the following variables:
+#
+# METIS_ROOT: Prefix where METIS is installed.
+# METIS_LIB_NAME: Name of the METIS library (default: metis).
+# METIS_LIBRARY: Full path of the METIS library.
+
+# Sets the following variables:
+#
+# METIS_LIBRARY: Full path of the METIS library.
+# METIS_FOUND: True if ParMETIS was found.
+# METIS_LIBRARIES: List of all libraries needed for linking with METIS,
+#
+# Provides the following macros:
+#
+# find_package(METIS)
+#
+# Searches for METIS (See above)
+
+
+# search metis header
+find_path(METIS_INCLUDE_DIR metis.h
+ PATHS ${METIS_DIR} ${METIS_ROOT}
+ PATH_SUFFIXES metis include include/metis Lib METISLib
+ NO_DEFAULT_PATH
+ DOC "Include directory of metis")
+find_path(METIS_INCLUDE_DIR metis.h
+ PATH_SUFFIXES metis include include/metis Lib METISLib)
+
+set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library")
+
+# search metis library
+if(NOT METIS_LIB_NAME)
+ set(METIS_LIB_NAME metis)
+endif(NOT METIS_LIB_NAME)
+
+find_library(METIS_LIBRARY ${METIS_LIB_NAME}
+ PATHS ${METIS_DIR} ${METIS_ROOT}
+ PATH_SUFFIXES lib
+ NO_DEFAULT_PATH)
+find_library(METIS_LIBRARY ${METIS_LIB_NAME}
+ PATH_SUFFIXES lib
+)
+
+# behave like a CMake module is supposed to behave
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+ "METIS"
+ DEFAULT_MSG
+ METIS_INCLUDE_DIR
+ METIS_LIBRARY
+)
+
+mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME)
+
+# if both headers and library are found, store results
+if(METIS_FOUND)
+ set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR})
+ set(METIS_LIBRARIES ${METIS_LIBRARY})
+ # log result
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+ "Determining location of METIS succeeded:\n"
+ "Include directory: ${METIS_INCLUDE_DIRS}\n"
+ "Library directory: ${METIS_LIBRARIES}\n\n")
+else(METIS_FOUND)
+ # log erroneous result
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+ "Determining location of METIS failed:\n"
+ "Include directory: ${METIS_INCLUDE_DIRS}\n"
+ "Library directory: ${METIS_LIBRARIES}\n\n")
+endif(METIS_FOUND)
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
new file mode 100644
index 0000000..322a80a
--- /dev/null
+++ b/cmake/Modules/FindMKL.cmake
@@ -0,0 +1,14 @@
+# - Try to find DNNL(MKL-DNN)
+# Once done this will define
+# DNNL_FOUND - System has DNNL
+# DNNL_INCLUDE_DIR - The DNNL include directories
+# DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build
+# DNNL_LIBRARY - The libraries needed to use DNNL
+# DNNL_DEFINITIONS - Compiler switches required for using DNNL
+
+find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include )
+find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include )
+find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src )
+
+include ( FindPackageHandleStandardArgs )
+find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR )
diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake
new file mode 100644
index 0000000..b0a81d9
--- /dev/null
+++ b/cmake/Modules/FindNCCL.cmake
@@ -0,0 +1,97 @@
+# Try to find NCCL
+#
+# The following variables are optionally searched for defaults
+# NCCL_ROOT: Base directory where all NCCL components are found
+# NCCL_ROOT_DIR: Base directory where all NCCL components are found
+# NCCL_INCLUDE_DIR: Directory where NCCL header is found
+# NCCL_LIB_DIR: Directory where NCCL library is found
+#
+# The following are set after configuration is done:
+# NCCL_FOUND
+# NCCL_INCLUDE_DIRS
+# NCCL_LIBRARIES
+#
+# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
+# install NCCL in the same location as the CUDA toolkit.
+# See https://github.com/caffe2/caffe2/issues/1601
+
+if (NOT DEFINED NCCL_ROOT)
+ set(NCCL_ROOT $ENV{CONDA_PREFIX})
+endif()
+
+set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL")
+
+find_path(NCCL_INCLUDE_DIRS
+ NAMES nccl.h
+ HINTS
+ ${NCCL_ROOT}
+ ${NCCL_ROOT}/include
+ ${NCCL_INCLUDE_DIR}
+ ${NCCL_ROOT_DIR}
+ ${NCCL_ROOT_DIR}/include
+ ${CUDA_TOOLKIT_ROOT_DIR}/include
+ REQUIRED)
+
+if ($ENV{USE_STATIC_NCCL})
+ message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
+ set(NCCL_LIBNAME "libnccl_static.a")
+else()
+ set(NCCL_LIBNAME "nccl")
+endif()
+
+find_library(NCCL_LIBRARIES
+ NAMES ${NCCL_LIBNAME}
+ HINTS
+ ${NCCL_LIB_DIR}
+ ${NCCL_ROOT}
+ ${NCCL_ROOT}/lib
+ ${NCCL_ROOT}/lib/x86_64-linux-gnu
+ ${NCCL_ROOT}/lib64
+ ${NCCL_ROOT_DIR}
+ ${NCCL_ROOT_DIR}/lib
+ ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
+ ${NCCL_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ REQUIRED)
+
+set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
+message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
+set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
+list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR})
+include(CheckCXXSymbolExists)
+check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
+
+if (NCCL_VERSION_DEFINED)
+ set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
+ file(WRITE ${file} "
+ #include <iostream>
+ #include \"${NCCL_HEADER_FILE}\"
+ int main()
+ {
+ std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH;
+ int x;
+ ncclGetVersion(&x);
+ return x == NCCL_VERSION_CODE;
+ }
+ ")
+ try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
+ RUN_OUTPUT_VARIABLE NCCL_VERSION
+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
+ LINK_LIBRARIES ${NCCL_LIBRARIES})
+ if (NOT NCCL_VERSION_MATCHED)
+ message(FATAL_ERROR "Found NCCL header version and library version do not match! \
+ (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
+ endif()
+ message(STATUS "NCCL version: ${NCCL_VERSION}")
+else()
+ message(STATUS "NCCL version < 2.3.5-5")
+endif ()
+set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
+
+mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+ NCCL
+ REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES
+ VERSION_VAR NCCL_VERSION)
diff --git a/cmake/Modules/FindZMQ.cmake b/cmake/Modules/FindZMQ.cmake
new file mode 100644
index 0000000..806c9c6
--- /dev/null
+++ b/cmake/Modules/FindZMQ.cmake
@@ -0,0 +1,47 @@
+# - Try to find ZMQ
+# Once done this will define
+# ZMQ_FOUND - System has ZMQ
+# ZMQ_INCLUDE_DIRS - The ZMQ include directories
+# ZMQ_LIBRARIES - The libraries needed to use ZMQ
+# ZMQ_DEFINITIONS - Compiler switches required for using ZMQ
+
+find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include )
+find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib )
+
+set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} )
+set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} )
+
+if (ZMQ_LIBRARIES AND ZMQ_INCLUDE_DIRS)
+ set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc")
+ file(WRITE ${file} "
+ #include <iostream>
+ #include \"${ZMQ_INCLUDE_DIRS}/zmq.h\"
+ int main()
+ {
+ std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH;
+ int x, y, z;
+ zmq_version(&x, &y, &z);
+ return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH;
+ }
+ ")
+ try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
+ RUN_OUTPUT_VARIABLE ZMQ_VERSION
+ LINK_LIBRARIES ${ZMQ_LIBRARIES})
+ if (NOT ZMQ_VERSION_MATCHED)
+ message(WARNING "Found ZMQ header version and library version do not match! \
+ (include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). Please set ZMQ_ROOT and ZMQ_BUILD carefully.")
+ unset(ZMQ_INCLUDE_DIRS)
+ unset(ZMQ_LIBRARIES)
+ unset(ZMQ_VERSION)
+ else ()
+ message(STATUS "ZMQ version: ${ZMQ_VERSION}")
+ endif()
+endif()
+
+include ( FindPackageHandleStandardArgs )
+# handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE
+# if all listed variables are TRUE
+find_package_handle_standard_args (
+ ZMQ
+ REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS
+ VERSION_VAR ZMQ_VERSION)
diff --git a/cmake/config.example.cmake b/cmake/config.example.cmake
new file mode 100644
index 0000000..86224f1
--- /dev/null
+++ b/cmake/config.example.cmake
@@ -0,0 +1,55 @@
+######################
+### Set targets ######
+######################
+
+# hetu main version, choose from (mkl, gpu, all)
+# if using mkl (for CPU) or all, OpenMP(*), mkl required
+# if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required
+set(HETU_VERSION "all")
+
+# whether to compile allreduce module
+# nccl(*), openmpi required
+set(HETU_ALLREDUCE ON)
+
+# whether to compile ps module
+# protobuf(*), zeromq required
+set(HETU_PS ON)
+
+# whether to compile geometric module (for GNNs)
+# pybind11(*), metis(*) required
+set(HETU_GEOMETRIC ON)
+
+# whether to compile cache module (for PS)
+# to enable this, you must turn HETU_PS on
+# pybind11(*) required
+set(HETU_CACHE ON)
+
+# whether to compile Hetu ML Module
+set(HETU_ML ON)
+set(HETU_PARALLEL_ML ON)
+
+######################
+### Set paths ########
+######################
+
+# CUDA version >= 10.1
+set(CUDAToolkit_ROOT /usr/local/cuda)
+
+# NCCL version >= 2.8
+set(NCCL_ROOT $ENV{CONDA_PREFIX})
+
+set(CUDNN_ROOT)
+
+# MPI version >= 3.1 (OpenMPI version >= 4.0.3)
+# if valid version not found, we'll download and compile it in time (openmpi-4.0.3)
+set(MPI_HOME $ENV{CONDA_PREFIX})
+
+# MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl
+# if not found, we'll download and compile it in time
+set(MKL_ROOT $ENV{CONDA_PREFIX})
+set(MKL_BUILD $ENV{CONDA_PREFIX})
+
+# ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq
+# if not found, we'll download and compile it in time
+set(ZMQ_ROOT $ENV{CONDA_PREFIX})
+set(ZMQ_BUILD $ENV{CONDA_PREFIX})
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..a230326
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,84 @@
+name: hetu
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=4.5=1_gnu
+ - bcrypt=3.2.0=py37h5e8e339_1
+ - blas=1.0=mkl
+ - bzip2=1.0.8=h7b6447c_0
+ - ca-certificates=2021.7.5=h06a4308_1
+ - certifi=2021.5.30=py37h06a4308_0
+ - cffi=1.14.6=py37hc58025e_0
+ - cmake=3.18.2=ha30ef3c_0
+ - cryptography=3.4.7=py37h5d9358c_0
+ - cudatoolkit=10.1.243=h6bb024c_0
+ - expat=2.4.1=h2531618_2
+ - intel-openmp=2021.3.0=h06a4308_3350
+ - joblib=1.0.1=pyhd3eb1b0_0
+ - krb5=1.18.2=h173b8e3_0
+ - ld_impl_linux-64=2.35.1=h7274673_9
+ - libcurl=7.71.1=h20c2e04_1
+ - libedit=3.1.20210216=h27cfd23_1
+ - libffi=3.3=he6710b0_2
+ - libgcc-ng=9.3.0=h5101ec6_17
+ - libgfortran-ng=7.5.0=h14aa051_19
+ - libgfortran4=7.5.0=h14aa051_19
+ - libgomp=9.3.0=h5101ec6_17
+ - libprotobuf=3.15.8=h780b84a_0
+ - libsodium=1.0.18=h7b6447c_0
+ - libssh2=1.9.0=h1ba5d50_1
+ - libstdcxx-ng=9.3.0=hd4cf53a_17
+ - libuv=1.40.0=h7b6447c_0
+ - lz4-c=1.9.3=h2531618_0
+ - metis=5.1.0=hf484d3e_4
+ - mkl=2021.3.0=h06a4308_520
+ - mkl-service=2.4.0=py37h7f8727e_0
+ - mkl_fft=1.3.0=py37h42c9631_2
+ - mkl_random=1.2.2=py37h51133e4_0
+ - mpi=1.0=openmpi
+ - nccl=2.8.3.1=hcaf9a05_0
+ - ncurses=6.2=he6710b0_1
+ - numpy=1.20.3=py37hf144106_0
+ - numpy-base=1.20.3=py37h74d4b33_0
+ - onednn=2.3=omp_hf4ef041_0
+ - onnx=1.9.0=py37h284874a_0
+ - onnxruntime=1.7.2=py37he8cb6d3_1
+ - openmpi=4.0.3=hdf1f1ad_1
+ - openssl=1.1.1k=h27cfd23_0
+ - pandas=1.2.5=py37h295c915_0
+ - paramiko=2.7.2=pyh9f0ad1d_0
+ - pip=21.1.3=py37h06a4308_0
+ - protobuf=3.15.8=py37hcd2ae1e_0
+ - psutil=5.8.0=py37h5e8e339_1
+ - pybind11=2.6.2=py37hff7bd54_1
+ - pycparser=2.20=pyh9f0ad1d_2
+ - pynacl=1.4.0=py37h5e8e339_2
+ - python=3.7.10=h12debd9_4
+ - python-dateutil=2.8.2=pyhd3eb1b0_0
+ - python_abi=3.7=2_cp37m
+ - pytz=2021.1=pyhd3eb1b0_0
+ - pyyaml=5.4.1=py37h27cfd23_1
+ - re2=2021.04.01=h9c3ff4c_0
+ - readline=8.1=h27cfd23_0
+ - rhash=1.4.1=h3c74f83_1
+ - scikit-learn=0.24.2=py37ha9443f7_0
+ - scipy=1.6.2=py37had2a1c9_1
+ - setuptools=52.0.0=py37h06a4308_0
+ - six=1.16.0=pyhd3eb1b0_0
+ - sqlite=3.36.0=hc218d9a_0
+ - threadpoolctl=2.2.0=pyhb85f177_0
+ - tk=8.6.10=hbc83047_0
+ - tqdm=4.61.2=pyhd3eb1b0_1
+ - typing-extensions=3.10.0.0=hd8ed1ab_0
+ - typing_extensions=3.10.0.0=pyha770c72_0
+ - wheel=0.36.2=pyhd3eb1b0_0
+ - xz=5.2.5=h7b6447c_0
+ - yaml=0.2.5=h7b6447c_0
+ - zeromq=4.3.2=he6710b0_3
+ - zlib=1.2.11=h7b6447c_3
+ - zstd=1.4.9=haebb681_0
+ - pip:
+ - cloudpickle==1.6.0
+ - wget==3.2
diff --git a/examples/cnn/README.md b/examples/cnn/README.md
new file mode 100644
index 0000000..474b927
--- /dev/null
+++ b/examples/cnn/README.md
@@ -0,0 +1,49 @@
+# CNN Examples
+In this directory we provide simple implementations for CNN models, including both hetu and tensorflow versions for comparison.
+## Structure
+```
+- cnn
+ - models/ CNN models in HETU
+ - pytorch_models/ CNN models in PyTorch
+ - tf_models/ CNN models in TensorFlow
+ - scripts/ Test scripts
+ - main.py Trainer for HETU
+ - run_tf_horovod.py Trainer for Horovod
+ - tf_launch_server.py Trainer for TF-PS (role: server)
+ - tf_launch_worker.py Trainer for TF-PS (role: worker)
+ - tf_main.py Trainer for TensorFlow
+ - torch_main.py Trainer for PyTorch
+ -
+```
+## Usage
+Here are some examples of running scripts.
+```bash
+bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu
+bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node)
+bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 16-GPU (2-nodes)
+```
+To train in PS setting, we also need to launch scheduler and server first. For more information about distributed training, please refer to CTR or GNN examples.
+
+We can change the setting in scripts. See `mnist_mlp.sh` below.
+```bash
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../main.py
+
+### validate and timing
+python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing
+
+### run in cpu
+# python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing
+
+```
+
+For more details about training setting, please refer to `main.py`.
+## Models
+We provide following models with specific datasets.
+```
+CIFAR100: VGG, ResNet
+CIFAR10: MLP, VGG, ResNet
+MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN
+```
diff --git a/examples/cnn/local_s1.yml b/examples/cnn/local_s1.yml
new file mode 100644
index 0000000..20ffc94
--- /dev/null
+++ b/examples/cnn/local_s1.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13030
+ DMLC_NUM_WORKER : 2
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
+launch :
+ worker : 0
+ server : 1
+ scheduler : true
diff --git a/examples/cnn/main.py b/examples/cnn/main.py
new file mode 100644
index 0000000..1a4a224
--- /dev/null
+++ b/examples/cnn/main.py
@@ -0,0 +1,202 @@
+import hetu as ht
+import models
+import os
+import numpy as np
+import argparse
+import json
+import logging
+from time import time
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def print_rank0(msg):
+ if device_id == 0:
+ logger.info(msg)
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ help='model to be tested')
+ parser.add_argument('--dataset', type=str, required=True,
+ help='dataset to be trained on')
+ parser.add_argument('--batch-size', type=int,
+ default=128, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.1, help='learning rate')
+ parser.add_argument('--opt', type=str, default='sgd',
+ help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+ parser.add_argument('--num-epochs', type=int,
+ default=10, help='epoch number')
+ parser.add_argument('--gpu', type=int, default=0,
+ help='gpu to be used, -1 means cpu')
+ parser.add_argument('--validate', action='store_true',
+ help='whether to use validation')
+ parser.add_argument('--timing', action='store_true',
+ help='whether to time the training phase')
+ parser.add_argument('--comm-mode', default=None, help='communication mode')
+ args = parser.parse_args()
+
+ global device_id
+ device_id = 0
+ print_rank0("Training {} on HETU".format(args.model))
+ if args.comm_mode in ('AllReduce', 'Hybrid'):
+ comm, device_id = ht.mpi_nccl_init()
+ executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0)
+ else:
+ if args.gpu == -1:
+ executor_ctx = ht.cpu(0)
+ print_rank0('Use CPU.')
+ else:
+ executor_ctx = ht.gpu(args.gpu)
+ print_rank0('Use GPU %d.' % args.gpu)
+ if args.comm_mode in ('PS', 'Hybrid'):
+ settings_file = open(os.path.join(os.path.abspath(
+ os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu))
+ settings = json.load(settings_file)
+ for key in settings:
+ if type(settings[key]) == str:
+ os.environ[key] = settings[key]
+ else:
+ os.environ[key] = str(settings[key]) # type is str
+
+ assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \
+ 'Model not supported!'
+ model = eval('models.' + args.model)
+
+ assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+ dataset = args.dataset
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+
+ if args.opt == 'sgd':
+ print_rank0('Use SGD Optimizer.')
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ elif args.opt == 'momentum':
+ print_rank0('Use Momentum Optimizer.')
+ opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate)
+ elif args.opt == 'nesterov':
+ print_rank0('Use Nesterov Momentum Optimizer.')
+ opt = ht.optim.MomentumOptimizer(
+ learning_rate=args.learning_rate, nesterov=True)
+ elif args.opt == 'adagrad':
+ print_rank0('Use AdaGrad Optimizer.')
+ opt = ht.optim.AdaGradOptimizer(
+ learning_rate=args.learning_rate, initial_accumulator_value=0.1)
+ else:
+ print_rank0('Use Adam Optimizer.')
+ opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate)
+
+ # data loading
+ print_rank0('Loading %s data...' % dataset)
+ if dataset == 'MNIST':
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+ # train_set_x: (50000, 784), train_set_y: (50000, 10)
+ # valid_set_x: (10000, 784), valid_set_y: (10000, 10)
+ # x_shape = (args.batch_size, 784)
+ # y_shape = (args.batch_size, 10)
+ elif dataset == 'CIFAR10':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
+ num_class=10)
+ if args.model == "mlp":
+ train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
+ valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
+ # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10)
+ # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10)
+ # x_shape = (args.batch_size, 3, 32, 32)
+ # y_shape = (args.batch_size, 10)
+ elif dataset == 'CIFAR100':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
+ num_class=100)
+ # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100)
+ # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100)
+ else:
+ raise NotImplementedError
+
+ # model definition
+ print_rank0('Building model {}'.format(args.model))
+ x = ht.dataloader_op([
+ ht.Dataloader(train_set_x, args.batch_size, 'train'),
+ ht.Dataloader(valid_set_x, args.batch_size, 'validate'),
+ ])
+ y_ = ht.dataloader_op([
+ ht.Dataloader(train_set_y, args.batch_size, 'train'),
+ ht.Dataloader(valid_set_y, args.batch_size, 'validate'),
+ ])
+ if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
+ loss, y = model(x, y_, 100)
+ else:
+ loss, y = model(x, y_)
+
+ train_op = opt.minimize(loss)
+
+ eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]}
+ executor = ht.Executor(eval_nodes, ctx=executor_ctx,
+ comm_mode=args.comm_mode)
+ n_train_batches = executor.get_batch_num('train')
+ n_valid_batches = executor.get_batch_num('validate')
+
+ # training
+ print_rank0("Start training loop...")
+ running_time = 0
+ for i in range(args.num_epochs + 1):
+ print_rank0("Epoch %d" % i)
+ loss_all = 0
+ batch_num = 0
+ if args.timing:
+ start = time()
+ correct_predictions = []
+ for minibatch_index in range(n_train_batches):
+ loss_val, predict_y, y_val, _ = executor.run(
+ 'train', eval_node_list=[loss, y, y_, train_op])
+ # Loss for this minibatch
+ predict_y = predict_y.asnumpy()
+ y_val = y_val.asnumpy()
+ loss_all += loss_val.asnumpy()
+ batch_num += 1
+ # Predict accuracy for this minibatch
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+
+ loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Train loss = %f" % loss_all)
+ print_rank0("Train accuracy = %f" % accuracy)
+
+ if args.timing:
+ end = time()
+ during_time = end - start
+ print_rank0("Running time of current epoch = %fs" % (during_time))
+ if i != 0:
+ running_time += during_time
+ if args.validate:
+ val_loss_all = 0
+ batch_num = 0
+ correct_predictions = []
+ for minibatch_index in range(n_valid_batches):
+ loss_val, valid_y_predicted, y_val = executor.run(
+ 'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True)
+ val_loss_all += loss_val
+ batch_num += 1
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(valid_y_predicted, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+
+ val_loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Validation loss = %f" % val_loss_all)
+ print_rank0("Validation accuracy = %f" % accuracy)
+ print_rank0("*"*50)
+ print_rank0("Running time of total %d epoch = %fs" %
+ (args.num_epochs, running_time))
+ if args.comm_mode in ('AllReduce', 'Hybrid'):
+ ht.mpi_nccl_finish(comm)
diff --git a/examples/cnn/models/AlexNet.py b/examples/cnn/models/AlexNet.py
new file mode 100644
index 0000000..1491dbf
--- /dev/null
+++ b/examples/cnn/models/AlexNet.py
@@ -0,0 +1,61 @@
+import hetu as ht
+from hetu import init
+
+
+def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False):
+ weight = init.random_normal(
+ shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
+ bn_scale = init.random_normal(
+ shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale')
+ bn_bias = init.random_normal(
+ shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias')
+ x = ht.conv2d_op(x, weight, stride=1, padding=1)
+ x = ht.batch_normalization_op(x, bn_scale, bn_bias)
+ if with_relu:
+ x = ht.relu_op(x)
+ if with_pool:
+ x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0)
+ return x
+
+
+def fc(x, shape, name, with_relu=True):
+ weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ if with_relu:
+ x = ht.relu_op(x)
+ return x
+
+
+def alexnet(x, y_):
+ '''
+ AlexNet model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print('Building AlexNet model...')
+ x = ht.array_reshape_op(x, [-1, 1, 28, 28])
+ x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1',
+ with_relu=True, with_pool=True)
+ x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2',
+ with_relu=True, with_pool=True)
+ x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3',
+ with_relu=True, with_pool=False)
+ x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4',
+ with_relu=True, with_pool=False)
+ x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5',
+ with_relu=False, with_pool=True)
+ x = ht.array_reshape_op(x, (-1, 256*3*3))
+ x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True)
+ x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
+ y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/CNN.py b/examples/cnn/models/CNN.py
new file mode 100644
index 0000000..ed84059
--- /dev/null
+++ b/examples/cnn/models/CNN.py
@@ -0,0 +1,41 @@
+import hetu as ht
+from hetu import init
+
+
+def conv_relu_avg(x, shape):
+ weight = init.random_normal(shape=shape, stddev=0.1)
+ x = ht.conv2d_op(x, weight, padding=2, stride=1)
+ x = ht.relu_op(x)
+ x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ return x
+
+
+def fc(x, shape):
+ weight = init.random_normal(shape=shape, stddev=0.1)
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1)
+ x = ht.array_reshape_op(x, (-1, shape[0]))
+ x = ht.matmul_op(x, weight)
+ y = x + ht.broadcastto_op(bias, x)
+ return y
+
+
+def cnn_3_layers(x, y_):
+ '''
+ 3-layer-CNN model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print('Building 3-layer-CNN model...')
+ x = ht.array_reshape_op(x, [-1, 1, 28, 28])
+ x = conv_relu_avg(x, (32, 1, 5, 5))
+ x = conv_relu_avg(x, (64, 32, 5, 5))
+ y = fc(x, (7 * 7 * 64, 10))
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/LSTM.py b/examples/cnn/models/LSTM.py
new file mode 100644
index 0000000..7469a7c
--- /dev/null
+++ b/examples/cnn/models/LSTM.py
@@ -0,0 +1,90 @@
+import hetu as ht
+from hetu import init
+import numpy as np
+
+
+def lstm(x, y_):
+ '''
+ LSTM model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+ diminput = 28
+ dimhidden = 128
+ dimoutput = 10
+ nsteps = 28
+
+ forget_gate_w = init.random_normal(
+ shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w")
+ forget_gate_u = init.random_normal(
+ shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u")
+ forget_gate_b = init.random_normal(
+ shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b")
+ input_gate_w = init.random_normal(
+ shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w")
+ input_gate_u = init.random_normal(
+ shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u")
+ input_gate_b = init.random_normal(
+ shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b")
+ output_gate_w = init.random_normal(
+ shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w")
+ output_gate_u = init.random_normal(
+ shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u")
+ output_gate_b = init.random_normal(
+ shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b")
+ tanh_w = init.random_normal(
+ shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w")
+ tanh_u = init.random_normal(
+ shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u")
+ tanh_b = init.random_normal(
+ shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b")
+ out_weights = init.random_normal(
+ shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight")
+ out_bias = init.random_normal(
+ shape=(dimoutput,), stddev=0.1, name="lstm_out_bias")
+ initial_state = ht.Variable(value=np.zeros((1,)).astype(
+ np.float32), name='initial_state', trainable=False)
+
+ for i in range(nsteps):
+ cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput))
+ # forget gate
+ if i == 0:
+ temp = ht.matmul_op(cur_x, forget_gate_w)
+ last_c_state = ht.broadcastto_op(initial_state, temp)
+ last_h_state = ht.broadcastto_op(initial_state, temp)
+ cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp
+ else:
+ cur_forget = ht.matmul_op(
+ last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w)
+ cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget)
+ cur_forget = ht.sigmoid_op(cur_forget)
+ # input gate
+ cur_input = ht.matmul_op(
+ last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w)
+ cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input)
+ cur_input = ht.sigmoid_op(cur_input)
+ # output gate
+ cur_output = ht.matmul_op(
+ last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w)
+ cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output)
+ cur_output = ht.sigmoid_op(cur_output)
+ # tanh
+ cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \
+ ht.matmul_op(cur_x, tanh_w)
+ cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh)
+ cur_tanh = ht.tanh_op(cur_tanh)
+
+ last_c_state = ht.mul_op(last_c_state, cur_forget) + \
+ ht.mul_op(cur_input, cur_tanh)
+ last_h_state = ht.tanh_op(last_c_state) * cur_output
+
+ x = ht.matmul_op(last_h_state, out_weights)
+ y = x + ht.broadcastto_op(out_bias, x)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/LeNet.py b/examples/cnn/models/LeNet.py
new file mode 100644
index 0000000..a14903c
--- /dev/null
+++ b/examples/cnn/models/LeNet.py
@@ -0,0 +1,46 @@
+import hetu as ht
+from hetu import init
+
+
+def conv_pool(x, in_channel, out_channel, name):
+ weight = init.random_normal(
+ shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight')
+ x = ht.conv2d_op(x, weight, padding=2, stride=1)
+ x = ht.relu_op(x)
+ x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ return x
+
+
+def fc(x, shape, name, with_relu=True):
+ weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ if with_relu:
+ x = ht.relu_op(x)
+ return x
+
+
+def lenet(x, y_):
+ '''
+ LeNet model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print('Building LeNet model...')
+ x = ht.array_reshape_op(x, (-1, 1, 28, 28))
+ x = conv_pool(x, 1, 6, name='lenet_conv1')
+ x = conv_pool(x, 6, 16, name='lenet_conv2')
+ x = ht.array_reshape_op(x, (-1, 7*7*16))
+ x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True)
+ x = fc(x, (120, 84), name='lenet_fc2', with_relu=True)
+ y = fc(x, (84, 10), name='lenet_fc3', with_relu=False)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/LogReg.py b/examples/cnn/models/LogReg.py
new file mode 100644
index 0000000..a8a05a6
--- /dev/null
+++ b/examples/cnn/models/LogReg.py
@@ -0,0 +1,24 @@
+import hetu as ht
+from hetu import init
+
+
+def logreg(x, y_):
+ '''
+ Logistic Regression model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print("Build logistic regression model...")
+ weight = init.zeros((784, 10), name='logreg_weight')
+ bias = init.zeros((10,), name='logreg_bias')
+ x = ht.matmul_op(x, weight)
+ y = x + ht.broadcastto_op(bias, x)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/MLP.py b/examples/cnn/models/MLP.py
new file mode 100644
index 0000000..184e200
--- /dev/null
+++ b/examples/cnn/models/MLP.py
@@ -0,0 +1,33 @@
+import hetu as ht
+from hetu import init
+
+
+def fc(x, shape, name, with_relu=True):
+ weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ if with_relu:
+ x = ht.relu_op(x)
+ return x
+
+
+def mlp(x, y_):
+ '''
+    MLP model, for CIFAR10 dataset (input flattened to 3072 = 32*32*3).
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print("Building MLP model...")
+ x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True)
+ x = fc(x, (256, 256), 'mlp_fc2', with_relu=True)
+ y = fc(x, (256, 10), 'mlp_fc3', with_relu=False)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/RNN.py b/examples/cnn/models/RNN.py
new file mode 100644
index 0000000..a767952
--- /dev/null
+++ b/examples/cnn/models/RNN.py
@@ -0,0 +1,56 @@
+import hetu as ht
+from hetu import init
+import numpy as np
+
+
+def rnn(x, y_):
+ '''
+ RNN model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print("Building RNN model...")
+ diminput = 28
+ dimhidden = 128
+ dimoutput = 10
+ nsteps = 28
+
+ weight1 = init.random_normal(
+ shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1')
+ bias1 = init.random_normal(
+ shape=(dimhidden, ), stddev=0.1, name='rnn_bias1')
+ weight2 = init.random_normal(
+ shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2')
+ bias2 = init.random_normal(
+ shape=(dimhidden, ), stddev=0.1, name='rnn_bias2')
+ weight3 = init.random_normal(
+ shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3')
+ bias3 = init.random_normal(
+ shape=(dimoutput, ), stddev=0.1, name='rnn_bias3')
+ last_state = ht.Variable(value=np.zeros((1,)).astype(
+ np.float32), name='initial_state', trainable=False)
+
+ for i in range(nsteps):
+ cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput))
+ h = ht.matmul_op(cur_x, weight1)
+ h = h + ht.broadcastto_op(bias1, h)
+
+ if i == 0:
+ last_state = ht.broadcastto_op(last_state, h)
+ s = ht.concat_op(h, last_state, axis=1)
+ s = ht.matmul_op(s, weight2)
+ s = s + ht.broadcastto_op(bias2, s)
+ last_state = ht.relu_op(s)
+
+ final_state = last_state
+ x = ht.matmul_op(final_state, weight3)
+ y = x + ht.broadcastto_op(bias3, x)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/cnn/models/ResNet.py b/examples/cnn/models/ResNet.py
new file mode 100644
index 0000000..cd1d30e
--- /dev/null
+++ b/examples/cnn/models/ResNet.py
@@ -0,0 +1,125 @@
+import hetu as ht
+from hetu import init
+
+
+def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''):
+ weight = init.random_normal(
+ shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
+ x = ht.conv2d_op(x, weight, stride=stride, padding=padding)
+ return x
+
+
+def batch_norm_with_relu(x, hidden, name):
+ scale = init.random_normal(
+ shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale')
+ bias = init.random_normal(shape=(1, hidden, 1, 1),
+ stddev=0.1, name=name+'_bias')
+ x = ht.batch_normalization_op(x, scale, bias)
+ x = ht.relu_op(x)
+ return x
+
+
+def resnet_block(x, in_channel, num_blocks, is_first=False, name=''):
+ if is_first:
+ out_channel = in_channel
+ identity = x
+ x = conv2d(x, in_channel, out_channel, stride=1,
+ padding=1, name=name+'_conv1')
+ x = batch_norm_with_relu(x, out_channel, name+'_bn1')
+ x = conv2d(x, out_channel, out_channel, stride=1,
+ padding=1, name=name+'_conv2')
+ x = x + identity
+ else:
+ out_channel = 2 * in_channel
+ identity = x
+ x = batch_norm_with_relu(x, in_channel, name+'_bn0')
+ x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]])
+ x = conv2d(x, in_channel, out_channel, stride=2,
+ padding=0, name=name+'_conv1')
+ x = batch_norm_with_relu(x, out_channel, name+'_bn1')
+ x = conv2d(x, out_channel, out_channel, stride=1,
+ padding=1, name=name+'_conv2')
+ identity = ht.avg_pool2d_op(
+ identity, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ identity = ht.pad_op(
+ identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
+ x = x + identity
+
+ for i in range(1, num_blocks):
+ identity = x
+ x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i))
+ x = conv2d(x, out_channel, out_channel, stride=1,
+ padding=1, name=name+'_conv%d' % (2 * i + 1))
+ x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1))
+ x = conv2d(x, out_channel, out_channel, stride=1,
+ padding=1, name=name+'_conv%d' % (2 * i + 2))
+ x = x + identity
+
+ return x
+
+
+def fc(x, shape, name):
+ weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ return x
+
+
+def resnet(x, y_, num_layers=18, num_class=10):
+ '''
+ ResNet model, for CIFAR10 dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ num_layers: 18 or 34
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ base_size = 16
+
+ x = conv2d(x, 3, base_size, stride=1, padding=1,
+ name='resnet_initial_conv')
+ x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')
+
+ if num_layers == 18:
+ print("Building ResNet-18 model...")
+ x = resnet_block(x, base_size, num_blocks=2,
+ is_first=True, name='resnet_block1')
+ x = resnet_block(x, base_size, num_blocks=2,
+ is_first=False, name='resnet_block2')
+ x = resnet_block(x, 2 * base_size, num_blocks=2,
+ is_first=False, name='resnet_block3')
+ x = resnet_block(x, 4 * base_size, num_blocks=2,
+ is_first=False, name='resnet_block4')
+ elif num_layers == 34:
+ print("Building ResNet-34 model...")
+ x = resnet_block(x, base_size, num_blocks=3,
+ is_first=True, name='resnet_block1')
+ x = resnet_block(x, base_size, num_blocks=4,
+ is_first=False, name='resnet_block2')
+ x = resnet_block(x, 2 * base_size, num_blocks=6,
+ is_first=False, name='resnet_block3')
+ x = resnet_block(x, 4 * base_size, num_blocks=3,
+ is_first=False, name='resnet_block4')
+ else:
+ assert False, "Number of layers should be 18 or 34 !"
+
+ x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
+ x = ht.array_reshape_op(x, (-1, 128 * base_size))
+ y = fc(x, (128 * base_size, num_class), name='resnet_final_fc')
+ # here we don't use cudnn for softmax crossentropy to avoid overflows
+ loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
+
+
+def resnet18(x, y_, num_class=10):
+ return resnet(x, y_, 18, num_class)
+
+
+def resnet34(x, y_, num_class=10):
+ return resnet(x, y_, 34, num_class)
diff --git a/examples/cnn/models/VGG.py b/examples/cnn/models/VGG.py
new file mode 100644
index 0000000..9cdab5f
--- /dev/null
+++ b/examples/cnn/models/VGG.py
@@ -0,0 +1,100 @@
+import hetu as ht
+from hetu import init
+
+
+def conv_bn_relu(x, in_channel, out_channel, name):
+ weight = init.random_normal(shape=(out_channel, in_channel, 3, 3),
+ stddev=0.1, name=name+'_weight')
+ bn_scale = init.random_normal(shape=(1, out_channel, 1, 1),
+ stddev=0.1, name=name+'_bn_scale')
+ bn_bias = init.random_normal(shape=(1, out_channel, 1, 1),
+ stddev=0.1, name=name+'_bn_bias')
+
+ x = ht.conv2d_op(x, weight, padding=1, stride=1)
+ x = ht.batch_normalization_op(x, bn_scale, bn_bias)
+ act = ht.relu_op(x)
+ return act
+
+
+def vgg_2block(x, in_channel, out_channel, name):
+ x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
+ x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ return x
+
+
+def vgg_3block(x, in_channel, out_channel, name):
+ x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
+ x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ return x
+
+
+def vgg_4block(x, in_channel, out_channel, name):
+ x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
+ x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4')
+ x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
+ return x
+
+
+def vgg_fc(x, in_feat, out_feat, name):
+ weight = init.random_normal(shape=(in_feat, out_feat),
+ stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=(out_feat,),
+ stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ return x
+
+
+def vgg(x, y_, num_layers, num_class=10):
+ '''
+ VGG model, for CIFAR10/CIFAR100 dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ num_layers: 16 or 19
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ if num_layers == 16:
+ print('Building VGG-16 model...')
+ x = vgg_2block(x, 3, 64, 'vgg_block1')
+ x = vgg_2block(x, 64, 128, 'vgg_block2')
+ x = vgg_3block(x, 128, 256, 'vgg_block3')
+ x = vgg_3block(x, 256, 512, 'vgg_block4')
+ x = vgg_3block(x, 512, 512, 'vgg_block5')
+
+ elif num_layers == 19:
+ print('Building VGG-19 model...')
+ x = vgg_2block(x, 3, 64, 'vgg_block1')
+ x = vgg_2block(x, 64, 128, 'vgg_block2')
+ x = vgg_4block(x, 128, 256, 'vgg_block3')
+ x = vgg_4block(x, 256, 512, 'vgg_block4')
+ x = vgg_4block(x, 512, 512, 'vgg_block5')
+
+ else:
+ assert False, 'VGG model should have 16 or 19 layers!'
+
+ x = ht.array_reshape_op(x, (-1, 512))
+ x = vgg_fc(x, 512, 4096, 'vgg_fc1')
+ x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
+ y = vgg_fc(x, 4096, num_class, 'vgg_fc3')
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+
+ return loss, y
+
+
+def vgg16(x, y_, num_class=10):
+ return vgg(x, y_, 16, num_class)
+
+
+def vgg19(x, y_, num_class=10):
+ return vgg(x, y_, 19, num_class)
diff --git a/examples/cnn/models/__init__.py b/examples/cnn/models/__init__.py
new file mode 100644
index 0000000..f6306d2
--- /dev/null
+++ b/examples/cnn/models/__init__.py
@@ -0,0 +1,9 @@
+from .VGG import vgg, vgg16, vgg19
+from .LogReg import logreg
+from .CNN import cnn_3_layers
+from .AlexNet import alexnet
+from .LeNet import lenet
+from .MLP import mlp
+from .RNN import rnn
+from .LSTM import lstm
+from .ResNet import resnet, resnet18, resnet34
diff --git a/examples/cnn/pytorch_models/__init__.py b/examples/cnn/pytorch_models/__init__.py
new file mode 100644
index 0000000..022e62e
--- /dev/null
+++ b/examples/cnn/pytorch_models/__init__.py
@@ -0,0 +1,4 @@
+from .mlp import mlp
+from .resnet import resnet18, resnet34, resnet50
+from .vgg import vgg16, vgg19
+from .rnn import rnn
diff --git a/examples/cnn/pytorch_models/mlp.py b/examples/cnn/pytorch_models/mlp.py
new file mode 100644
index 0000000..398799f
--- /dev/null
+++ b/examples/cnn/pytorch_models/mlp.py
@@ -0,0 +1,20 @@
+import torch.nn.functional as F
+import torch.nn as nn
+
+
+class MLP(nn.Module):
+ def __init__(self):
+ super(MLP, self).__init__()
+ self.fc1 = nn.Linear(3072, 256)
+ self.fc2 = nn.Linear(256, 256)
+ self.fc3 = nn.Linear(256, 10)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = F.relu(self.fc2(x))
+ out = self.fc3(x)
+ return out
+
+
+def mlp():
+ return MLP()
diff --git a/examples/cnn/pytorch_models/resnet.py b/examples/cnn/pytorch_models/resnet.py
new file mode 100644
index 0000000..927adec
--- /dev/null
+++ b/examples/cnn/pytorch_models/resnet.py
@@ -0,0 +1,116 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, in_planes, planes, stride=1):
+ super(BasicBlock, self).__init__()
+ self.conv1 = nn.Conv2d(
+ in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+ stride=1, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes)
+
+ self.shortcut = nn.Sequential()
+ if stride != 1 or in_planes != self.expansion*planes:
+ self.shortcut = nn.Sequential(
+ nn.Conv2d(in_planes, self.expansion*planes,
+ kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion*planes)
+ )
+
+ def forward(self, x):
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = self.bn2(self.conv2(out))
+ out += self.shortcut(x)
+ out = F.relu(out)
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, in_planes, planes, stride=1):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+ stride=stride, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(planes, self.expansion *
+ planes, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(self.expansion*planes)
+
+ self.shortcut = nn.Sequential()
+ if stride != 1 or in_planes != self.expansion*planes:
+ self.shortcut = nn.Sequential(
+ nn.Conv2d(in_planes, self.expansion*planes,
+ kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion*planes)
+ )
+
+ def forward(self, x):
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = F.relu(self.bn2(self.conv2(out)))
+ out = self.bn3(self.conv3(out))
+ out += self.shortcut(x)
+ out = F.relu(out)
+ return out
+
+
+class ResNet(nn.Module):
+ def __init__(self, block, num_blocks, num_classes=10):
+ super(ResNet, self).__init__()
+ self.in_planes = 64
+
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
+ stride=1, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+ self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+ self.linear = nn.Linear(512*block.expansion, num_classes)
+
+ def _make_layer(self, block, planes, num_blocks, stride):
+ strides = [stride] + [1]*(num_blocks-1)
+ layers = []
+ for stride in strides:
+ layers.append(block(self.in_planes, planes, stride))
+ self.in_planes = planes * block.expansion
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = self.layer1(out)
+ out = self.layer2(out)
+ out = self.layer3(out)
+ out = self.layer4(out)
+ out = F.avg_pool2d(out, 4)
+ out = out.view(out.size(0), -1)
+ out = self.linear(out)
+ return out
+
+
+def resnet18(num_classes=10):
+ return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)
+
+
+def resnet34(num_classes=10):
+ return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)
+
+
+def resnet50(num_classes=10):
+ return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)
+
+
+def resnet101(num_classes=10):
+ return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
+
+
+def resnet152(num_classes=10):
+ return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)
diff --git a/examples/cnn/pytorch_models/rnn.py b/examples/cnn/pytorch_models/rnn.py
new file mode 100644
index 0000000..0a298d7
--- /dev/null
+++ b/examples/cnn/pytorch_models/rnn.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class RNN(nn.Module):
+ def __init__(self, diminput, dimoutput, dimhidden, nsteps):
+ super(RNN, self).__init__()
+ self.diminput = diminput
+ self.dimoutput = dimoutput
+ self.dimhidden = dimhidden
+ self.nsteps = nsteps
+ self.fc1 = nn.Linear(diminput, dimhidden)
+ self.fc2 = nn.Linear(dimhidden*2, dimhidden)
+ self.fc3 = nn.Linear(dimhidden, dimoutput)
+
+ def forward(self, x):
+ last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device)
+ for i in range(self.nsteps):
+ t = i % self.nsteps
+ index = torch.Tensor([idx for idx in range(
+ t*self.diminput, (t+1)*self.diminput)]).long().to(x.device)
+ cur_x = torch.index_select(x, 1, index)
+ h = self.fc1(cur_x)
+ s = torch.cat([h, last_state], axis=1)
+ s = self.fc2(s)
+ last_state = F.relu(s)
+
+ final_state = last_state
+ y = self.fc3(final_state)
+ return y
+
+
+def rnn(diminput, dimoutput, dimhidden, nsteps):
+
+ return RNN(diminput, dimoutput, dimhidden, nsteps)
diff --git a/examples/cnn/pytorch_models/vgg.py b/examples/cnn/pytorch_models/vgg.py
new file mode 100644
index 0000000..0fa2e88
--- /dev/null
+++ b/examples/cnn/pytorch_models/vgg.py
@@ -0,0 +1,48 @@
+import torch
+import torch.nn as nn
+
+
+cfg = {
+ 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+ 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+ 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+ 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+}
+
+
+class VGG(nn.Module):
+ def __init__(self, vgg_name, num_class=10):
+ super(VGG, self).__init__()
+ self.features = self._make_layers(cfg[vgg_name])
+ self.fc1 = nn.Linear(512, 4096)
+ self.fc2 = nn.Linear(4096, 4096)
+ self.classifier = nn.Linear(4096, num_class)
+
+ def forward(self, x):
+ out = self.features(x)
+ out = out.view(out.size(0), -1)
+ out = self.fc2(self.fc1(out))
+ out = self.classifier(out)
+ return out
+
+ def _make_layers(self, cfg):
+ layers = []
+ in_channels = 3
+ for x in cfg:
+ if x == 'M':
+ layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+ else:
+ layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
+ nn.BatchNorm2d(x),
+ nn.ReLU(inplace=True)]
+ in_channels = x
+ layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
+ return nn.Sequential(*layers)
+
+
+def vgg16(num_class=10):
+ return VGG('VGG16', num_class)
+
+
+def vgg19(num_class=10):
+ return VGG('VGG19', num_class)
diff --git a/examples/cnn/run_tf_horovod.py b/examples/cnn/run_tf_horovod.py
new file mode 100644
index 0000000..fa2b96f
--- /dev/null
+++ b/examples/cnn/run_tf_horovod.py
@@ -0,0 +1,309 @@
+import os
+import numpy as np
+import tensorflow as tf
+import tf_models
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+import horovod.tensorflow as hvd
+import hetu as ht
+import logging
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def print_rank0(msg):
+ if rank % 8 == 0:
+ logger.info(msg)
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
+# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
+# horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8
+# python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn
+
+
+# if using multi nodes setting in conda, need to modify /etc/bash.bashrc
+# we can also use mpirun (default gloo):
+# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
+# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
+'''
+def train(model, args):
+ hvd.init()
+
+ def get_current_shard(data):
+ part_size = data.shape[0] // hvd.size()
+ start = part_size * hvd.rank()
+ end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0]
+ return data[start:end]
+
+ batch_size = 128
+ if args.model == 'tf_resnet34':
+ train_images, train_labels, test_images,\
+ test_labels = ht.data.tf_normalize_cifar10()
+ x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3])
+        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])
+ else:
+ datasets = ht.data.mnist()
+ train_images, train_labels = datasets[0]
+ test_images, test_labels = datasets[2]
+ x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784])
+        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])
+
+
+ n_train_batches = train_images.shape[0] // batch_size
+
+ loss, y = model(x, y_)
+ opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+
+ global_step = tf.train.get_or_create_global_step()
+ # here in DistributedOptimizer by default all tensor are reduced on GPU
+ # can use device_sparse=xxx, device_dense=xxx to modify
+ # if using device_sparse='/cpu:0', the performance degrades
+ train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step)
+
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank()))
+ # here horovod default use gpu to initialize, which will cause OOM
+ hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
+ sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+
+ iterations = train_images.shape[0] // batch_size
+ total_epoch = 10
+ start_index = 0
+ total_time = 0
+ for ep in range(total_epoch + 1):
+ print("epoch %d" % ep)
+ st_time = time.time()
+ train_loss, train_acc = [], []
+ for it in range(n_train_batches):
+ x_val = train_images[start_index: start_index + batch_size]
+ y_val = train_labels[start_index : start_index+batch_size]
+ start_index += batch_size
+ if start_index + batch_size > train_images.shape[0]:
+ start_index = 0
+ loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val})
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ en_time = time.time()
+ train_time = en_time - st_time
+ if ep != 0:
+ total_time += train_time
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, train_time)
+
+ print("training time:", total_time)
+
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True, help="model to be tested")
+ parser.add_argument("--all", action="store_true", help="whether to use all data")
+ args = parser.parse_args()
+ raw_model = args.model
+ import tf_models
+ model = eval('tf_models.' + raw_model)
+ print('Model:', raw_model)
+ train(model, args)
+
+if __name__ == '__main__':
+ main()
+'''
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ help='model to be tested')
+ parser.add_argument('--dataset', type=str, required=True,
+ help='dataset to be trained on')
+ parser.add_argument('--batch-size', type=int,
+ default=128, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.1, help='learning rate')
+ parser.add_argument('--opt', type=str, default='sgd',
+ help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+ parser.add_argument('--num-epochs', type=int,
+ default=20, help='epoch number')
+ parser.add_argument('--validate', action='store_true',
+ help='whether to use validation')
+ parser.add_argument('--timing', action='store_true',
+ help='whether to time the training phase')
+ args = parser.parse_args()
+
+ hvd.init()
+ global rank
+ rank = hvd.rank()
+ assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
+ 'Model not supported now.'
+ model = eval('tf_models.' + args.model)
+
+ assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+ dataset = args.dataset
+
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+ if args.opt == 'sgd':
+ print_rank0('Use SGD Optimizer.')
+ opt = tf.train.GradientDescentOptimizer(
+ learning_rate=args.learning_rate)
+ elif args.opt == 'momentum':
+ print_rank0('Use Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9)
+ elif args.opt == 'nesterov':
+ print_rank0('Use Nesterov Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
+ elif args.opt == 'adagrad':
+ print_rank0('Use AdaGrad Optimizer.')
+ opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
+ else:
+ print_rank0('Use Adam Optimizer.')
+ opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+
+ if dataset == 'MNIST':
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 784), train_set_y: (50000,)
+ # valid_set_x: (10000, 784), valid_set_y: (10000,)
+ elif dataset == 'CIFAR10':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=10)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ if args.model == "tf_mlp":
+ train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
+ valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ elif dataset == 'CIFAR100':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=100)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ else:
+ raise NotImplementedError
+
+ if dataset == 'MNIST':
+ x = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 784), name='x')
+ y_ = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_)
+ elif dataset == 'CIFAR10':
+ if args.model == "tf_mlp":
+ x = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 3072), name='x')
+ y_ = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ else:
+ x = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
+ y_ = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_, 10)
+ elif dataset == 'CIFAR100':
+ x = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
+ y_ = tf.compat.v1.placeholder(
+ dtype=tf.float32, shape=(None, 100), name='y_')
+ loss, y = model(x, y_, 100)
+
+ global_step = tf.train.get_or_create_global_step()
+ # here in DistributedOptimizer by default all tensor are reduced on GPU
+ # can use device_sparse=xxx, device_dense=xxx to modify
+ # if using device_sparse='/cpu:0', the performance degrades
+ train_op = hvd.DistributedOptimizer(
+ opt).minimize(loss, global_step=global_step)
+
+ gpu_options = tf.compat.v1.GPUOptions(
+ allow_growth=True, visible_device_list=str(hvd.local_rank()))
+ # here horovod default use gpu to initialize, which will cause OOM
+ hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
+ sess = tf.compat.v1.train.MonitoredTrainingSession(
+ hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+
+ # sess.run(tf.compat.v1.global_variables_initializer())
+
+ # training
+ print_rank0("Start training loop...")
+ running_time = 0
+ for i in range(args.num_epochs + 1):
+ print_rank0("Epoch %d" % i)
+ loss_all = 0
+ batch_num = 0
+ if args.timing:
+ start = time.time()
+ correct_predictions = []
+ for minibatch_index in range(n_train_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ x_val = train_set_x[minibatch_start:minibatch_end]
+ y_val = train_set_y[minibatch_start:minibatch_end]
+ loss_val, predict_y, _ = sess.run([loss, y, train_op],
+ feed_dict={x: x_val, y_: y_val})
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ batch_num += 1
+ loss_all += loss_val
+ loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Train loss = %f" % loss_all)
+ print_rank0("Train accuracy = %f" % accuracy)
+
+ if args.timing:
+ end = time.time()
+ print_rank0("Running time of current epoch = %fs" % (end - start))
+ if i != 0:
+ running_time += (end - start)
+
+ if args.validate:
+ val_loss_all = 0
+ batch_num = 0
+ correct_predictions = []
+ for minibatch_index in range(n_valid_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ valid_x_val = valid_set_x[minibatch_start:minibatch_end]
+ valid_y_val = valid_set_y[minibatch_start:minibatch_end]
+ loss_val, valid_y_predicted = sess.run([loss, y],
+ feed_dict={x: valid_x_val, y_: valid_y_val})
+ correct_prediction = np.equal(
+ np.argmax(valid_y_val, 1),
+ np.argmax(valid_y_predicted, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+            val_loss_all += loss_val
+ batch_num += 1
+ val_loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Validation loss = %f" % val_loss_all)
+ print_rank0("Validation accuracy = %f" % accuracy)
+ print_rank0("*"*50)
+ print_rank0("Running time of total %d epoch = %fs" %
+ (args.num_epochs, running_time))
diff --git a/examples/cnn/scripts/hetu_16gpu.sh b/examples/cnn/scripts/hetu_16gpu.sh
new file mode 100644
index 0000000..4b4c130
--- /dev/null
+++ b/examples/cnn/scripts/hetu_16gpu.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../main.py
+depsdir=${workdir}/../../..
+echo $depsdir
+### validate and timing
+$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce
+
diff --git a/examples/cnn/scripts/hetu_1gpu.sh b/examples/cnn/scripts/hetu_1gpu.sh
new file mode 100644
index 0000000..e63b3eb
--- /dev/null
+++ b/examples/cnn/scripts/hetu_1gpu.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../main.py
+
+
+# model:
+# e.g. bash hetu_1gpu.sh mlp CIFAR10
+
+### validate and timing
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing
diff --git a/examples/cnn/scripts/hetu_2gpu_ps.sh b/examples/cnn/scripts/hetu_2gpu_ps.sh
new file mode 100644
index 0000000..92f5eae
--- /dev/null
+++ b/examples/cnn/scripts/hetu_2gpu_ps.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../main.py
+
+### validate and timing
+python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched &
+python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 &
+python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 &
+wait
\ No newline at end of file
diff --git a/examples/cnn/scripts/hetu_8gpu.sh b/examples/cnn/scripts/hetu_8gpu.sh
new file mode 100644
index 0000000..f2a99e3
--- /dev/null
+++ b/examples/cnn/scripts/hetu_8gpu.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../main.py
+depsdir=${workdir}/../../..
+
+### validate and timing
+#
+NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce
diff --git a/examples/cnn/scripts/horovod_16gpu.sh b/examples/cnn/scripts/horovod_16gpu.sh
new file mode 100644
index 0000000..7db8800
--- /dev/null
+++ b/examples/cnn/scripts/horovod_16gpu.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_tf_horovod.py
+
+# horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing
+
+horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing
+
+# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
+# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
diff --git a/examples/cnn/scripts/horovod_8gpu.sh b/examples/cnn/scripts/horovod_8gpu.sh
new file mode 100644
index 0000000..10e3cc7
--- /dev/null
+++ b/examples/cnn/scripts/horovod_8gpu.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_tf_horovod.py
+
+horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing
diff --git a/examples/cnn/scripts/pytorch_16gpu_0.sh b/examples/cnn/scripts/pytorch_16gpu_0.sh
new file mode 100644
index 0000000..2a847f9
--- /dev/null
+++ b/examples/cnn/scripts/pytorch_16gpu_0.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=162.105.146.117
+MASTER_PORT=6000
+NNODES=2
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../torch_main.py
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+ ${mainpy} \
+ --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed
\ No newline at end of file
diff --git a/examples/cnn/scripts/pytorch_16gpu_1.sh b/examples/cnn/scripts/pytorch_16gpu_1.sh
new file mode 100644
index 0000000..dba0ecf
--- /dev/null
+++ b/examples/cnn/scripts/pytorch_16gpu_1.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=162.105.146.117
+MASTER_PORT=6000
+NNODES=2
+NODE_RANK=1
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../torch_main.py
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+ ${mainpy} \
+ --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed
\ No newline at end of file
diff --git a/examples/cnn/scripts/pytorch_1gpu.sh b/examples/cnn/scripts/pytorch_1gpu.sh
new file mode 100644
index 0000000..cc40166
--- /dev/null
+++ b/examples/cnn/scripts/pytorch_1gpu.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../torch_main.py
+
+## validate and timing
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing
diff --git a/examples/cnn/scripts/pytorch_8gpu.sh b/examples/cnn/scripts/pytorch_8gpu.sh
new file mode 100644
index 0000000..ebbb049
--- /dev/null
+++ b/examples/cnn/scripts/pytorch_8gpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../torch_main.py
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+ ${mainpy} \
+ --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed
\ No newline at end of file
diff --git a/examples/cnn/scripts/tf_16gpu_worker0.sh b/examples/cnn/scripts/tf_16gpu_worker0.sh
new file mode 100644
index 0000000..0f11c59
--- /dev/null
+++ b/examples/cnn/scripts/tf_16gpu_worker0.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate &
+wait
+
diff --git a/examples/cnn/scripts/tf_16gpu_worker1.sh b/examples/cnn/scripts/tf_16gpu_worker1.sh
new file mode 100644
index 0000000..dcdde09
--- /dev/null
+++ b/examples/cnn/scripts/tf_16gpu_worker1.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate &
+wait
\ No newline at end of file
diff --git a/examples/cnn/scripts/tf_1gpu.sh b/examples/cnn/scripts/tf_1gpu.sh
new file mode 100644
index 0000000..f58f323
--- /dev/null
+++ b/examples/cnn/scripts/tf_1gpu.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_main.py
+
+### validate and timing
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing
+
+### run in cpu
+# python ${mainpy} --model tf_mlp --gpu -1 --validate --timing
diff --git a/examples/cnn/scripts/tf_8gpu.sh b/examples/cnn/scripts/tf_8gpu.sh
new file mode 100644
index 0000000..70ecb79
--- /dev/null
+++ b/examples/cnn/scripts/tf_8gpu.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate &
+python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate &
+wait
+
diff --git a/examples/cnn/settings/tf_dist_s1_w16.json b/examples/cnn/settings/tf_dist_s1_w16.json
new file mode 100644
index 0000000..2b3c1d0
--- /dev/null
+++ b/examples/cnn/settings/tf_dist_s1_w16.json
@@ -0,0 +1,23 @@
+{
+ "worker": [
+ "162.105.146.117:34569",
+ "162.105.146.117:34568",
+ "162.105.146.117:34567",
+ "162.105.146.117:34566",
+ "162.105.146.117:34565",
+ "162.105.146.117:34564",
+ "162.105.146.117:34563",
+ "162.105.146.117:34562",
+ "162.105.146.118:34779",
+ "162.105.146.118:34778",
+ "162.105.146.118:34777",
+ "162.105.146.118:34776",
+ "162.105.146.118:34775",
+ "162.105.146.118:34774",
+ "162.105.146.118:34773",
+ "162.105.146.118:34772"
+ ],
+ "ps": [
+ "162.105.146.117:34575"
+ ]
+}
\ No newline at end of file
diff --git a/examples/cnn/settings/tf_dist_s1_w4.json b/examples/cnn/settings/tf_dist_s1_w4.json
new file mode 100644
index 0000000..c9f08f6
--- /dev/null
+++ b/examples/cnn/settings/tf_dist_s1_w4.json
@@ -0,0 +1,11 @@
+{
+ "worker": [
+ "162.105.146.119:34569",
+ "162.105.146.119:34568",
+ "162.105.146.119:34567",
+ "162.105.146.119:34566"
+ ],
+ "ps": [
+ "162.105.146.119:34575"
+ ]
+}
\ No newline at end of file
diff --git a/examples/cnn/settings/tf_dist_s1_w8.json b/examples/cnn/settings/tf_dist_s1_w8.json
new file mode 100644
index 0000000..92ce51c
--- /dev/null
+++ b/examples/cnn/settings/tf_dist_s1_w8.json
@@ -0,0 +1,15 @@
+{
+ "worker": [
+ "162.105.146.119:34569",
+ "162.105.146.119:34568",
+ "162.105.146.119:34567",
+ "162.105.146.119:34566",
+ "162.105.146.119:34565",
+ "162.105.146.119:34564",
+ "162.105.146.119:34563",
+ "162.105.146.119:34562"
+ ],
+ "ps": [
+ "162.105.146.119:34575"
+ ]
+}
\ No newline at end of file
diff --git a/examples/cnn/tf_launch_server.py b/examples/cnn/tf_launch_server.py
new file mode 100644
index 0000000..67ca216
--- /dev/null
+++ b/examples/cnn/tf_launch_server.py
@@ -0,0 +1,49 @@
+import os
+import tensorflow as tf
+import multiprocessing
+import signal
+import json
+import argparse
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+ os.environ['CUDA_VISIBLE_DEVICES'] = ''
+
+
+pop_env()
+
+
+def start_server(cluster, task_id):
+ server = tf.train.Server(cluster, job_name='ps', task_index=task_id)
+ server.join()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path")
+ parser.add_argument("--id", type=int, required=True)
+ args = parser.parse_args()
+ raw_config = args.config
+ config = json.load(open(raw_config))
+ cluster = tf.train.ClusterSpec(config)
+ global proc
+ proc = multiprocessing.Process(
+ target=start_server, args=[cluster, args.id, ])
+ proc.start()
+ signal.signal(signal.SIGINT, signal_handler)
+ proc.join()
+
+
+def signal_handler(signal, frame):
+ print("SIGINT signal caught, stop Training")
+ global proc
+ proc.kill()
+ exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/cnn/tf_launch_worker.py b/examples/cnn/tf_launch_worker.py
new file mode 100644
index 0000000..925bc83
--- /dev/null
+++ b/examples/cnn/tf_launch_worker.py
@@ -0,0 +1,234 @@
+import tensorflow as tf
+import tf_models
+import hetu as ht
+
+import numpy as np
+import argparse
+import json
+from time import time
+import os
+import logging
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def print_rank0(msg):
+ if task_id % 8 == 0:
+ logger.info(msg)
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ help='model to be tested')
+ parser.add_argument('--dataset', type=str, required=True,
+ help='dataset to be trained on')
+ parser.add_argument('--batch-size', type=int,
+ default=128, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.1, help='learning rate')
+ parser.add_argument('--opt', type=str, default='sgd',
+ help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+ parser.add_argument('--num-epochs', type=int,
+ default=20, help='epoch number')
+ parser.add_argument('--gpu', type=int, default=0,
+ help='gpu to be used, -1 means cpu')
+ parser.add_argument('--validate', action='store_true',
+ help='whether to use validation')
+ parser.add_argument('--timing', action='store_true',
+ help='whether to time the training phase')
+ parser.add_argument("--rank", type=int, required=True,
+ help="rank of process")
+ parser.add_argument(
+ "--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path")
+
+ args = parser.parse_args()
+ global task_id
+
+ task_id = int(args.rank)
+ print_rank0("task id %d" % (task_id))
+ raw_config = args.config
+
+ if args.gpu == -1:
+ device = '/job:worker/task:%d/cpu:0' % (task_id)
+ print_rank0('Use CPU.')
+ else:
+ device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu)
+ print_rank0('Use GPU %d.' % args.gpu)
+
+ config = json.load(open(raw_config))
+ cluster = tf.train.ClusterSpec(config)
+
+ assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
+ 'Model not supported now.'
+ model = eval('tf_models.' + args.model)
+
+ assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+ dataset = args.dataset
+
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+ if args.opt == 'sgd':
+ print_rank0('Use SGD Optimizer.')
+ opt = tf.train.GradientDescentOptimizer(
+ learning_rate=args.learning_rate)
+ elif args.opt == 'momentum':
+ print_rank0('Use Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9)
+ elif args.opt == 'nesterov':
+ print_rank0('Use Nesterov Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
+ elif args.opt == 'adagrad':
+ print_rank0('Use AdaGrad Optimizer.')
+ opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
+ else:
+ print_rank0('Use Adam Optimizer.')
+ opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+
+ with tf.device(
+ tf.compat.v1.train.replica_device_setter(
+ worker_device=device,
+ cluster=cluster)):
+ # data loading
+ print_rank0('Loading %s data...' % dataset)
+ if dataset == 'MNIST':
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 784), train_set_y: (50000,)
+ # valid_set_x: (10000, 784), valid_set_y: (10000,)
+ elif dataset == 'CIFAR10':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=10)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ if args.model == "tf_mlp":
+ train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
+ valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
+
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ elif dataset == 'CIFAR100':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=100)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ else:
+ raise NotImplementedError
+
+ if dataset == 'MNIST':
+ x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
+ y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_)
+ elif dataset == 'CIFAR10':
+ if args.model == "tf_mlp":
+ x = tf.placeholder(
+ dtype=tf.float32, shape=(None, 3072), name='x')
+ y_ = tf.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ else:
+ x = tf.placeholder(dtype=tf.float32, shape=(
+ None, 32, 32, 3), name='x')
+ y_ = tf.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_, 10)
+ elif dataset == 'CIFAR100':
+ x = tf.placeholder(dtype=tf.float32, shape=(
+ None, 32, 32, 3), name='x')
+ y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
+ loss, y = model(x, y_, 100)
+ train_op = opt.minimize(loss)
+
+ server = tf.train.Server(
+ cluster, job_name="worker", task_index=task_id)
+
+ init = tf.compat.v1.global_variables_initializer()
+ sv = tf.train.Supervisor(
+ is_chief=(task_id == 0),
+ init_op=init,
+ recovery_wait_secs=1)
+ sess_config = tf.compat.v1.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=False,
+ device_filters=["/job:ps",
+ "/job:worker/task:%d" % task_id])
+ sess = sv.prepare_or_wait_for_session(
+ server.target, config=sess_config)
+
+ sess.run(init)
+ # training
+ print_rank0("Start training loop...")
+ running_time = 0
+ for i in range(args.num_epochs + 1):
+ print_rank0("Epoch %d" % i)
+ loss_all = 0
+ batch_num = 0
+ if args.timing:
+ start = time()
+ correct_predictions = []
+ for minibatch_index in range(n_train_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ x_val = train_set_x[minibatch_start:minibatch_end]
+ y_val = train_set_y[minibatch_start:minibatch_end]
+ loss_val, predict_y, _ = sess.run([loss, y, train_op],
+ feed_dict={x: x_val, y_: y_val})
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ batch_num += 1
+ loss_all += loss_val
+ loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Train loss = %f" % loss_all)
+ print_rank0("Train accuracy = %f" % accuracy)
+
+ if args.timing:
+ end = time()
+ print_rank0("Running time of current epoch = %fs" %
+ (end - start))
+ if i != 0:
+ running_time += (end - start)
+
+ if args.validate:
+ val_loss_all = 0
+ batch_num = 0
+ correct_predictions = []
+ for minibatch_index in range(n_valid_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ valid_x_val = valid_set_x[minibatch_start:minibatch_end]
+ valid_y_val = valid_set_y[minibatch_start:minibatch_end]
+ loss_val, valid_y_predicted = sess.run([loss, y],
+ feed_dict={x: valid_x_val, y_: valid_y_val})
+ correct_prediction = np.equal(
+ np.argmax(valid_y_val, 1),
+ np.argmax(valid_y_predicted, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ val_loss_all += loss_val
+ batch_num += 1
+ val_loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Validation loss = %f" % val_loss_all)
+ print_rank0("Validation accuracy = %f" % accuracy)
+ print_rank0("*"*50)
+ print_rank0("Running time of total %d epoch = %fs" %
+ (args.num_epochs, running_time))
diff --git a/examples/cnn/tf_main.py b/examples/cnn/tf_main.py
new file mode 100644
index 0000000..feb1400
--- /dev/null
+++ b/examples/cnn/tf_main.py
@@ -0,0 +1,194 @@
+import tensorflow as tf
+import tf_models
+import hetu as ht
+import numpy as np
+import argparse
+from time import time
+import logging
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def print_rank0(msg):
+ logger.info(msg)
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, required=True,
+ help='model to be tested')
+ parser.add_argument('--dataset', type=str, required=True,
+ help='dataset to be trained on')
+ parser.add_argument('--batch-size', type=int,
+ default=128, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.1, help='learning rate')
+ parser.add_argument('--opt', type=str, default='sgd',
+ help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+ parser.add_argument('--num-epochs', type=int,
+ default=20, help='epoch number')
+ parser.add_argument('--gpu', type=int, default=0,
+ help='gpu to be used, -1 means cpu')
+ parser.add_argument('--validate', action='store_true',
+ help='whether to use validation')
+ parser.add_argument('--timing', action='store_true',
+ help='whether to time the training phase')
+ args = parser.parse_args()
+
+ if args.gpu == -1:
+ device = '/cpu:0'
+ print_rank0('Use CPU.')
+ else:
+ device = '/gpu:%d' % args.gpu
+ print_rank0('Use GPU %d.' % args.gpu)
+
+ print_rank0("Training {} on TensorFlow".format(args.model))
+ assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
+ 'Model not supported now.'
+ model = eval('tf_models.' + args.model)
+
+ assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+ dataset = args.dataset
+
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+ if args.opt == 'sgd':
+ print_rank0('Use SGD Optimizer.')
+ opt = tf.train.GradientDescentOptimizer(
+ learning_rate=args.learning_rate)
+ elif args.opt == 'momentum':
+ print_rank0('Use Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9)
+ elif args.opt == 'nesterov':
+ print_rank0('Use Nesterov Momentum Optimizer.')
+ opt = tf.train.MomentumOptimizer(
+ learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
+ elif args.opt == 'adagrad':
+ print_rank0('Use AdaGrad Optimizer.')
+ opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
+ else:
+ print_rank0('Use Adam Optimizer.')
+ opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+
+ # model definition
+ print_rank0('Building model...')
+ with tf.device(device):
+ if dataset == 'MNIST':
+ x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
+ y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_)
+ elif dataset == 'CIFAR10':
+ if args.model == "tf_mlp":
+ x = tf.placeholder(
+ dtype=tf.float32, shape=(None, 3072), name='x')
+ y_ = tf.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ else:
+ x = tf.placeholder(dtype=tf.float32, shape=(
+ None, 32, 32, 3), name='x')
+ y_ = tf.placeholder(
+ dtype=tf.float32, shape=(None, 10), name='y_')
+ loss, y = model(x, y_, 10)
+ elif dataset == 'CIFAR100':
+ x = tf.placeholder(dtype=tf.float32, shape=(
+ None, 32, 32, 3), name='x')
+ y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
+ loss, y = model(x, y_, 100)
+
+ train_op = opt.minimize(loss)
+
+ # data loading
+ print_rank0('Loading %s data...' % dataset)
+ if dataset == 'MNIST':
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 784), train_set_y: (50000,)
+ # valid_set_x: (10000, 784), valid_set_y: (10000,)
+ elif dataset == 'CIFAR10':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=10)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ if args.model == "tf_mlp":
+ train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
+ valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ elif dataset == 'CIFAR100':
+ train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
+ num_class=100)
+ n_train_batches = train_set_x.shape[0] // args.batch_size
+ n_valid_batches = valid_set_x.shape[0] // args.batch_size
+ # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
+ # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
+ else:
+ raise NotImplementedError
+
+ # training
+ print_rank0("Start training loop...")
+ running_time = 0
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+ for i in range(args.num_epochs + 1):
+ print_rank0("Epoch %d" % i)
+ loss_all = 0
+ batch_num = 0
+ if args.timing:
+ start = time()
+ correct_predictions = []
+ for minibatch_index in range(n_train_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ x_val = train_set_x[minibatch_start:minibatch_end]
+ y_val = train_set_y[minibatch_start:minibatch_end]
+ loss_val, predict_y, _ = sess.run([loss, y, train_op],
+ feed_dict={x: x_val, y_: y_val})
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ batch_num += 1
+ loss_all += loss_val
+ loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Train loss = %f" % loss_all)
+ print_rank0("Train accuracy = %f" % accuracy)
+
+ if args.timing:
+ end = time()
+ print_rank0("Running time of current epoch = %fs" %
+ (end - start))
+ if i != 0:
+ running_time += (end - start)
+
+ if args.validate:
+ val_loss_all = 0
+ batch_num = 0
+ correct_predictions = []
+ for minibatch_index in range(n_valid_batches):
+ minibatch_start = minibatch_index * args.batch_size
+ minibatch_end = (minibatch_index + 1) * args.batch_size
+ valid_x_val = valid_set_x[minibatch_start:minibatch_end]
+ valid_y_val = valid_set_y[minibatch_start:minibatch_end]
+ loss_val, valid_y_predicted = sess.run([loss, y],
+ feed_dict={x: valid_x_val, y_: valid_y_val})
+ correct_prediction = np.equal(
+ np.argmax(valid_y_val, 1),
+ np.argmax(valid_y_predicted, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ val_loss_all += loss_val
+ batch_num += 1
+ val_loss_all /= batch_num
+ accuracy = np.mean(correct_predictions)
+ print_rank0("Validation loss = %f" % val_loss_all)
+ print_rank0("Validation accuracy = %f" % accuracy)
+ print_rank0("*"*50)
+ print_rank0("Running time of total %d epoch = %fs" %
+ (args.num_epochs, running_time))
diff --git a/examples/cnn/tf_models/__init__.py b/examples/cnn/tf_models/__init__.py
new file mode 100644
index 0000000..6d713ff
--- /dev/null
+++ b/examples/cnn/tf_models/__init__.py
@@ -0,0 +1,8 @@
+from .tf_LogReg import tf_logreg
+from .tf_CNN import tf_cnn_3_layers
+from .tf_LeNet import tf_lenet
+from .tf_MLP import tf_mlp
+from .tf_RNN import tf_rnn
+from .tf_LSTM import tf_lstm
+from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34
+from .tf_VGG import tf_vgg16, tf_vgg19
diff --git a/examples/cnn/tf_models/tf_CNN.py b/examples/cnn/tf_models/tf_CNN.py
new file mode 100644
index 0000000..d8036a7
--- /dev/null
+++ b/examples/cnn/tf_models/tf_CNN.py
@@ -0,0 +1,45 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_conv_relu_avg(x, shape):
+ weight = tf.Variable(np.random.normal(
+ scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32))
+ x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
+ x = tf.nn.relu(x)
+ x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1],
+ padding='VALID', strides=[1, 2, 2, 1])
+ return x
+
+
+def tf_fc(x, shape):
+ weight = tf.Variable(np.random.normal(
+ scale=0.1, size=shape).astype(np.float32))
+ bias = tf.Variable(np.random.normal(
+ scale=0.1, size=shape[-1:]).astype(np.float32))
+ x = tf.reshape(x, (-1, shape[0]))
+ y = tf.matmul(x, weight) + bias
+ return y
+
+
+def tf_cnn_3_layers(x, y_):
+ '''
+ 3-layer-CNN model in TensorFlow, for MNIST dataset.
+
+ Parameters:
+ x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+ y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ Return:
+ loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+ y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ '''
+
+ print('Building 3-layer-CNN model in tensorflow...')
+ x = tf.reshape(x, [-1, 28, 28, 1])
+ x = tf_conv_relu_avg(x, (32, 1, 5, 5))
+ x = tf_conv_relu_avg(x, (64, 32, 5, 5))
+ x = tf.transpose(x, [0, 3, 1, 2])
+ y = tf_fc(x, (7 * 7 * 64, 10))
+ loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+ loss = tf.reduce_mean(loss)
+ return loss, y
diff --git a/examples/cnn/tf_models/tf_LSTM.py b/examples/cnn/tf_models/tf_LSTM.py
new file mode 100644
index 0000000..32ebe23
--- /dev/null
+++ b/examples/cnn/tf_models/tf_LSTM.py
@@ -0,0 +1,81 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_lstm(x, y_):
+    '''
+    LSTM model in TensorFlow, for MNIST dataset.
+
+    Parameters:
+        x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+        y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    Return:
+        loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+        y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    '''
+
+    print("Building LSTM model in tensorflow...")
+    # Each 28x28 image is read as 28 time steps of 28 features.
+    diminput = 28
+    dimhidden = 128
+    dimoutput = 10
+    nsteps = 28
+
+    # Per-gate parameters: *_w maps input->hidden, *_u maps hidden->hidden,
+    # *_b is the bias. The cell is unrolled manually below.
+    forget_gate_w = tf.Variable(np.random.normal(
+        scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
+    forget_gate_u = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
+    forget_gate_b = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden,)).astype(np.float32))
+    input_gate_w = tf.Variable(np.random.normal(
+        scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
+    input_gate_u = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
+    input_gate_b = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden,)).astype(np.float32))
+    output_gate_w = tf.Variable(np.random.normal(
+        scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
+    output_gate_u = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
+    output_gate_b = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden,)).astype(np.float32))
+    tanh_w = tf.Variable(np.random.normal(
+        scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
+    tanh_u = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
+    tanh_b = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden,)).astype(np.float32))
+    out_weights = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
+    out_bias = tf.Variable(np.random.normal(
+        scale=0.1, size=(dimoutput,)).astype(np.float32))
+    # Initial cell/hidden state sized from the runtime batch dimension.
+    initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32)
+
+    last_c_state = initial_state
+    last_h_state = initial_state
+
+    # Standard LSTM recurrence, unrolled over the 28 time steps.
+    for i in range(nsteps):
+        cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
+        # forget gate
+        cur_forget = tf.matmul(last_h_state, forget_gate_u) + \
+            tf.matmul(cur_x, forget_gate_w) + forget_gate_b
+        cur_forget = tf.sigmoid(cur_forget)
+        # input gate
+        cur_input = tf.matmul(last_h_state, input_gate_u) + \
+            tf.matmul(cur_x, input_gate_w) + input_gate_b
+        cur_input = tf.sigmoid(cur_input)
+        # output gate
+        cur_output = tf.matmul(last_h_state, output_gate_u) + \
+            tf.matmul(cur_x, output_gate_w) + output_gate_b
+        cur_output = tf.sigmoid(cur_output)
+        # candidate cell state
+        cur_tanh = tf.matmul(last_h_state, tanh_u) + \
+            tf.matmul(cur_x, tanh_w) + tanh_b
+        cur_tanh = tf.tanh(cur_tanh)
+
+        # c_t = f * c_{t-1} + i * c~;  h_t = o * tanh(c_t)
+        last_c_state = last_c_state * cur_forget + cur_input * cur_tanh
+        last_h_state = tf.tanh(last_c_state) * cur_output
+
+    # Classify from the final hidden state.
+    y = tf.matmul(last_h_state, out_weights) + out_bias
+    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+    loss = tf.reduce_mean(loss)
+    return loss, y
diff --git a/examples/cnn/tf_models/tf_LeNet.py b/examples/cnn/tf_models/tf_LeNet.py
new file mode 100644
index 0000000..03905f5
--- /dev/null
+++ b/examples/cnn/tf_models/tf_LeNet.py
@@ -0,0 +1,49 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_conv_pool(x, in_channel, out_channel):
+    # Conv(5x5, SAME) -> ReLU -> 2x2 max pool (stride 2). Filter created in
+    # OIHW order, transposed to TF's HWIO layout.
+    weight = tf.Variable(np.random.normal(scale=0.1, size=(
+        out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32))
+    x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
+    x = tf.nn.relu(x)
+    x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
+                       padding='VALID', strides=[1, 2, 2, 1])
+    return x
+
+
+def tf_fc(x, shape, with_relu=True):
+    # Fully-connected layer x @ W + b with optional ReLU.
+    # shape = (in_features, out_features); caller must pre-flatten x.
+    weight = tf.Variable(np.random.normal(
+        scale=0.1, size=shape).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=shape[-1:]).astype(np.float32))
+    x = tf.matmul(x, weight) + bias
+    if with_relu:
+        x = tf.nn.relu(x)
+    return x
+
+
+def tf_lenet(x, y_):
+    '''
+    LeNet model in TensorFlow, for MNIST dataset.
+
+    Parameters:
+        x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+        y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    Return:
+        loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+        y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    '''
+
+    print('Building LeNet model in tensorflow...')
+    # Flat 784-dim input -> NHWC; two conv+pool blocks: 28x28 -> 14x14 -> 7x7.
+    x = tf.reshape(x, [-1, 28, 28, 1])
+    x = tf_conv_pool(x, 1, 6)
+    x = tf_conv_pool(x, 6, 16)
+    # NHWC -> NCHW before flattening so the FC layout matches 7*7*16.
+    x = tf.transpose(x, [0, 3, 1, 2])
+    x = tf.reshape(x, (-1, 7*7*16))
+    x = tf_fc(x, (7*7*16, 120), with_relu=True)
+    x = tf_fc(x, (120, 84), with_relu=True)
+    y = tf_fc(x, (84, 10), with_relu=False)
+    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+    loss = tf.reduce_mean(loss)
+    return loss, y
diff --git a/examples/cnn/tf_models/tf_LogReg.py b/examples/cnn/tf_models/tf_LogReg.py
new file mode 100644
index 0000000..67a7577
--- /dev/null
+++ b/examples/cnn/tf_models/tf_LogReg.py
@@ -0,0 +1,23 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_logreg(x, y_):
+    '''
+    Logistic Regression model in TensorFlow, for MNIST dataset.
+
+    Parameters:
+        x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+        y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    Return:
+        loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+        y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    '''
+
+    print("Build logistic regression model in tensorflow...")
+    # Single linear layer, zero-initialized (fine here: softmax regression
+    # has no hidden layer, so zero init does not cause symmetry problems).
+    weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32))
+    bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32))
+    y = tf.matmul(x, weight) + bias
+    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+    loss = tf.reduce_mean(loss)
+    return loss, y
diff --git a/examples/cnn/tf_models/tf_MLP.py b/examples/cnn/tf_models/tf_MLP.py
new file mode 100644
index 0000000..8a23646
--- /dev/null
+++ b/examples/cnn/tf_models/tf_MLP.py
@@ -0,0 +1,34 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_fc(x, shape, with_relu=True):
+    # Fully-connected layer x @ W + b with optional ReLU.
+    # shape = (in_features, out_features); params drawn from N(0, 0.1).
+    weight = tf.Variable(np.random.normal(
+        scale=0.1, size=shape).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=shape[-1:]).astype(np.float32))
+    x = tf.matmul(x, weight) + bias
+    if with_relu:
+        x = tf.nn.relu(x)
+    return x
+
+
+def tf_mlp(x, y_, num_class=10):
+    '''
+    MLP model in TensorFlow, for CIFAR dataset.
+
+    Parameters:
+        x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+        y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    Return:
+        loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+        y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    '''
+
+    print("Building MLP model in tensorflow...")
+    # 3072 = 32*32*3 flattened CIFAR image; two hidden layers of 256 units.
+    x = tf_fc(x, (3072, 256), with_relu=True)
+    x = tf_fc(x, (256, 256), with_relu=True)
+    y = tf_fc(x, (256, num_class), with_relu=False)
+    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+    loss = tf.reduce_mean(loss)
+    return loss, y
diff --git a/examples/cnn/tf_models/tf_RNN.py b/examples/cnn/tf_models/tf_RNN.py
new file mode 100644
index 0000000..c1d1436
--- /dev/null
+++ b/examples/cnn/tf_models/tf_RNN.py
@@ -0,0 +1,49 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_rnn(x, y_):
+ '''
+ RNN model in TensorFlow, for MNIST dataset.
+
+ Parameters:
+ x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
+ y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ Return:
+ loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+ y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ '''
+
+ print("Building RNN model in tensorflow...")
+ diminput = 28
+ dimhidden = 128
+ dimoutput = 10
+ nsteps = 28
+
+ weight1 = tf.Variable(np.random.normal(
+ scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
+ bias1 = tf.Variable(np.random.normal(
+ scale=0.1, size=(dimhidden, )).astype(np.float32))
+ weight2 = tf.Variable(np.random.normal(scale=0.1, size=(
+ dimhidden + dimhidden, dimhidden)).astype(np.float32))
+ bias2 = tf.Variable(np.random.normal(
+ scale=0.1, size=(dimhidden, )).astype(np.float32))
+ weight3 = tf.Variable(np.random.normal(
+ scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
+ bias3 = tf.Variable(np.random.normal(
+ scale=0.1, size=(dimoutput, )).astype(np.float32))
+ last_state = tf.zeros((128, dimhidden), dtype=tf.float32)
+
+ for i in range(nsteps):
+ cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
+ h = tf.matmul(cur_x, weight1) + bias1
+
+ s = tf.concat([h, last_state], axis=1)
+ s = tf.matmul(s, weight2) + bias2
+ last_state = tf.nn.relu(s)
+
+ final_state = last_state
+ y = tf.matmul(final_state, weight3) + bias3
+ loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+ loss = tf.reduce_mean(loss)
+ return loss, y
diff --git a/examples/cnn/tf_models/tf_ResNet.py b/examples/cnn/tf_models/tf_ResNet.py
new file mode 100644
index 0000000..a0e0b2c
--- /dev/null
+++ b/examples/cnn/tf_models/tf_ResNet.py
@@ -0,0 +1,113 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_conv2d(x, in_channel, out_channel, stride=1):
+    # 3x3 SAME convolution; filter built in OIHW order and transposed to
+    # TF's HWIO layout. No bias (BN layers supply the offset).
+    weight = tf.Variable(np.random.normal(scale=0.1, size=(
+        out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
+    x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME')
+    return x
+
+
+def tf_batch_norm_with_relu(x, hidden):
+    # Batch norm over all axes except the last (channel) axis, then ReLU.
+    # Uses batch statistics directly (no moving averages), so train/eval
+    # behave identically here.
+    scale = tf.Variable(np.random.normal(
+        scale=0.1, size=(hidden,)).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=(hidden,)).astype(np.float32))
+    axis = list(range(len(x.shape) - 1))
+    a_mean, a_var = tf.nn.moments(x, axis)
+    # NOTE(review): variance_epsilon=1e-2 is much larger than the usual
+    # 1e-3/1e-5 — presumably intentional for stability; confirm.
+    x = tf.nn.batch_normalization(
+        x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
+    x = tf.nn.relu(x)
+    return x
+
+
+def tf_resnet_block(x, in_channel, num_blocks, is_first=False):
+    # A stage of pre-activation-style residual blocks.
+    # is_first=True: keeps channels/resolution; otherwise the stage doubles
+    # channels (out = 2*in) and halves the spatial size with a stride-2 conv.
+    if is_first:
+        out_channel = in_channel
+        identity = x
+        x = tf_conv2d(x, in_channel, out_channel, stride=1)
+        x = tf_batch_norm_with_relu(x, out_channel)
+        x = tf_conv2d(x, out_channel, out_channel, stride=1)
+        x = x + identity
+    else:
+        out_channel = 2 * in_channel
+        identity = x
+        x = tf_batch_norm_with_relu(x, in_channel)
+        x = tf_conv2d(x, in_channel, out_channel, stride=2)
+        x = tf_batch_norm_with_relu(x, out_channel)
+        x = tf_conv2d(x, out_channel, out_channel, stride=1)
+        # Shortcut: downsample spatially via avg-pool, then zero-pad the
+        # channel axis (in_channel//2 on each side) to reach out_channel.
+        identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[
+                                  1, 2, 2, 1], padding='VALID')
+        identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [
+                          in_channel // 2, in_channel // 2]])
+        x = x + identity
+
+    # Remaining (num_blocks - 1) identity blocks at out_channel width.
+    for i in range(1, num_blocks):
+        identity = x
+        x = tf_batch_norm_with_relu(x, out_channel)
+        x = tf_conv2d(x, out_channel, out_channel, stride=1)
+        x = tf_batch_norm_with_relu(x, out_channel)
+        x = tf_conv2d(x, out_channel, out_channel, stride=1)
+        x = x + identity
+
+    return x
+
+
+def tf_fc(x, shape):
+    # Final fully-connected classifier layer: x @ W + b (no activation).
+    weight = tf.Variable(np.random.normal(
+        scale=0.1, size=shape).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=shape[-1:]).astype(np.float32))
+    x = tf.matmul(x, weight) + bias
+    return x
+
+
+def tf_resnet(x, y_, num_layers, num_class=10):
+    '''
+    ResNet model in TensorFlow, for CIFAR10 dataset.
+
+    Parameters:
+        x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
+        y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+        num_layers: 18 or 34
+    Return:
+        loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+        y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+    '''
+    print("Number of Class: {}".format(num_class))
+    base_size = 16
+
+    x = tf_conv2d(x, 3, base_size, stride=1)
+    x = tf_batch_norm_with_relu(x, base_size)
+
+    # Channel progression: base -> base -> 2*base -> 4*base -> 8*base, with
+    # three stride-2 downsamplings (32x32 input ends at 4x4 spatial).
+    if num_layers == 18:
+        print("Building ResNet-18 model in tensorflow...")
+        x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True)
+        x = tf_resnet_block(x, base_size, num_blocks=2)
+        x = tf_resnet_block(x, 2 * base_size, num_blocks=2)
+        x = tf_resnet_block(x, 4 * base_size, num_blocks=2)
+    elif num_layers == 34:
+        print("Building ResNet-34 model in tensorflow...")
+        x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True)
+        x = tf_resnet_block(x, base_size, num_blocks=4)
+        x = tf_resnet_block(x, 2 * base_size, num_blocks=6)
+        x = tf_resnet_block(x, 4 * base_size, num_blocks=3)
+    else:
+        assert False, "Number of layers should be 18 or 34 !"
+
+    x = tf_batch_norm_with_relu(x, 8 * base_size)
+    # Flatten 4*4 spatial x 8*base channels = 128*base features.
+    x = tf.transpose(x, [0, 3, 1, 2])
+    x = tf.reshape(x, [-1, 128 * base_size])
+    y = tf_fc(x, (128 * base_size, num_class))
+    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+    loss = tf.reduce_mean(loss)
+    return loss, y
+
+
+def tf_resnet18(x, y_, num_class=10):
+    # Convenience wrapper: 18-layer ResNet.
+    return tf_resnet(x, y_, 18, num_class)
+
+
+def tf_resnet34(x, y_, num_class=10):
+    # Convenience wrapper: 34-layer ResNet.
+    return tf_resnet(x, y_, 34, num_class)
diff --git a/examples/cnn/tf_models/tf_VGG.py b/examples/cnn/tf_models/tf_VGG.py
new file mode 100644
index 0000000..124457d
--- /dev/null
+++ b/examples/cnn/tf_models/tf_VGG.py
@@ -0,0 +1,103 @@
+import numpy as np
+import tensorflow as tf
+
+
+def conv_bn_relu(x, in_channel, out_channel):
+    # Conv(3x3, SAME) -> batch norm (batch statistics) -> ReLU.
+    # Filter built in OIHW order and transposed to TF's HWIO layout.
+    weight = tf.Variable(np.random.normal(scale=0.1, size=(
+        out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
+    scale = tf.Variable(np.random.normal(
+        scale=0.1, size=(out_channel,)).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=(out_channel,)).astype(np.float32))
+    x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME')
+    # Normalize over all axes except the channel axis.
+    axis = list(range(len(x.shape) - 1))
+    a_mean, a_var = tf.nn.moments(x, axis)
+    x = tf.nn.batch_normalization(
+        x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
+    x = tf.nn.relu(x)
+    return x
+
+
+def vgg_2block(x, in_channel, out_channel):
+    # VGG stage: two conv-bn-relu layers followed by a 2x2 max pool.
+    x = conv_bn_relu(x, in_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
+                       1, 2, 2, 1], padding='VALID')
+    return x
+
+
+def vgg_3block(x, in_channel, out_channel):
+    # VGG stage: three conv-bn-relu layers followed by a 2x2 max pool.
+    x = conv_bn_relu(x, in_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
+                       1, 2, 2, 1], padding='VALID')
+    return x
+
+
+def vgg_4block(x, in_channel, out_channel):
+    # VGG stage: four conv-bn-relu layers followed by a 2x2 max pool.
+    x = conv_bn_relu(x, in_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = conv_bn_relu(x, out_channel, out_channel)
+    x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
+                       1, 2, 2, 1], padding='VALID')
+    return x
+
+
+def tf_fc(x, in_feat, out_feat):
+    # Fully-connected layer x @ W + b (no activation).
+    weight = tf.Variable(np.random.normal(
+        scale=0.1, size=(in_feat, out_feat)).astype(np.float32))
+    bias = tf.Variable(np.random.normal(
+        scale=0.1, size=(out_feat,)).astype(np.float32))
+    x = tf.matmul(x, weight) + bias
+    return x
+
+
+def tf_vgg(x, y_, num_layers, num_class=10):
+ '''
+ ResNet model in TensorFlow, for CIFAR10 dataset.
+
+ Parameters:
+ x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
+ y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ num_layers: 18 or 34
+ Return:
+ loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
+ y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
+ '''
+ if num_layers == 16:
+ print('Building VGG-16 model in tensorflow')
+ x = vgg_2block(x, 3, 64)
+ x = vgg_2block(x, 64, 128)
+ x = vgg_3block(x, 128, 256)
+ x = vgg_3block(x, 256, 512)
+ x = vgg_3block(x, 512, 512)
+
+ elif num_layers == 19:
+ print('Building VGG-19 model in tensorflow')
+ x = vgg_2block(x, 3, 64)
+ x = vgg_2block(x, 64, 128)
+ x = vgg_4block(x, 128, 256)
+ x = vgg_4block(x, 256, 512)
+ x = vgg_4block(x, 512, 512)
+ else:
+ assert False, "Number of layers should be 18 or 34 !"
+
+ x = tf.reshape(x, [-1, 512])
+ x = tf_fc(x, 512, 4096)
+ x = tf_fc(x, 4096, 4096)
+ y = tf_fc(x, 4096, num_class)
+ print("Number of Class: {}".format(num_class))
+
+ loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
+ loss = tf.reduce_mean(loss)
+ return loss, y
+
+
+def tf_vgg16(x, y_, num_class=10):
+ return tf_vgg(x, y_, 16, num_class)
+
+
+def tf_vgg19(x, y_, num_class=10):
+ return tf_vgg(x, y_, 34, num_class)
diff --git a/examples/cnn/torch_main.py b/examples/cnn/torch_main.py
new file mode 100644
index 0000000..f6ed928
--- /dev/null
+++ b/examples/cnn/torch_main.py
@@ -0,0 +1,213 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+from pytorch_models import *
+import hetu as ht
+import numpy as np
+import argparse
+from time import time
+import os
+import logging
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def print_rank0(msg):
+    # Log only on one process per 8-GPU node, using the module-level
+    # `local_rank` set in __main__ (0 in non-distributed runs).
+    if local_rank % 8 == 0:
+        logger.info(msg)
+
+
+def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None):
+    # Run one training epoch over `data`/`label` (numpy arrays), logging
+    # average loss and accuracy. Trailing samples that don't fill a full
+    # batch are dropped by the integer division below.
+    print_rank0('Epoch: %d' % epoch)
+    n_train_batches = data.shape[0] // batch_size
+
+    net.train()
+
+    train_loss = 0
+    correct = 0
+    total = 0
+
+    for minibatch_index in range(n_train_batches):
+        # NOTE(review): slicing uses the global args.batch_size rather than
+        # the batch_size parameter; they are equal as called, but prefer
+        # the parameter.
+        minibatch_start = minibatch_index * args.batch_size
+        minibatch_end = (minibatch_index + 1) * args.batch_size
+        inputs = torch.Tensor(data[minibatch_start:minibatch_end])
+        targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()
+
+        inputs, targets = inputs.to(device), targets.to(device)
+        optimizer.zero_grad()
+        outputs = net(inputs)
+        loss = criterion(outputs, targets)
+        loss.backward()
+        optimizer.step()
+
+        train_loss += loss.item()
+        _, predicted = outputs.max(1)
+        total += targets.size(0)
+        correct += predicted.eq(targets).sum().item()
+
+    print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1)))
+    print_rank0("Train accuracy = %f" % (100.*correct/total))
+
+
+def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None):
+    # Evaluate `net` on `data`/`label` without gradients, logging average
+    # loss and accuracy. Trailing partial batch is dropped.
+    net.eval()
+    n_test_batches = data.shape[0] // batch_size
+    test_loss = 0
+    correct = 0
+    total = 0
+
+    with torch.no_grad():
+        for minibatch_index in range(n_test_batches):
+            # NOTE(review): uses the global args.batch_size rather than the
+            # batch_size parameter (equal as called).
+            minibatch_start = minibatch_index * args.batch_size
+            minibatch_end = (minibatch_index + 1) * args.batch_size
+            inputs = torch.Tensor(data[minibatch_start:minibatch_end])
+            targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()
+
+            inputs, targets = inputs.to(device), targets.to(device)
+            outputs = net(inputs)
+            loss = criterion(outputs, targets)
+            test_loss += loss.item()
+            _, predicted = outputs.max(1)
+            total += targets.size(0)
+            correct += predicted.eq(targets).sum().item()
+
+    print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1)))
+    print_rank0("Validation accuracy = %f" % (100.*correct/total))
+
+
+if __name__ == "__main__":
+    # argument parser
+    # NOTE(review): `global` at module level is a no-op; local_rank is
+    # already a module-level name read by print_rank0.
+    global local_rank
+    local_rank = 0
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, required=True,
+                        help='model to be tested')
+    parser.add_argument('--dataset', type=str, required=True,
+                        help='dataset to be trained on')
+    parser.add_argument('--batch-size', type=int,
+                        default=128, help='batch size')
+    parser.add_argument('--learning-rate', type=float,
+                        default=0.1, help='learning rate')
+    parser.add_argument('--opt', type=str, default='sgd',
+                        help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+    parser.add_argument('--num-epochs', type=int,
+                        default=20, help='epoch number')
+    parser.add_argument('--gpu', type=int, default=0,
+                        help='gpu to be used, -1 means cpu')
+    parser.add_argument('--validate', action='store_true',
+                        help='whether to use validation')
+    parser.add_argument('--timing', action='store_true',
+                        help='whether to time the training phase')
+    parser.add_argument('--distributed', action='store_true',
+                        help='whether to distributed training')
+    parser.add_argument('--local_rank', type=int, default=-1)
+    args = parser.parse_args()
+
+    # Distributed setup: rendezvous address from MASTER_ADDR/MASTER_PORT,
+    # rank/world size from RANK/WORLD_SIZE, NCCL backend.
+    if args.distributed == True:
+        init_method = 'tcp://'
+        master_ip = os.getenv('MASTER_ADDR', 'localhost')
+        master_port = os.getenv('MASTER_PORT', '6000')
+        init_method += master_ip + ':' + master_port
+        rank = int(os.getenv('RANK', '0'))
+        world_size = int(os.getenv("WORLD_SIZE", '1'))
+        print("***"*50)
+        print(init_method)
+        torch.distributed.init_process_group(backend="nccl",
+                                             world_size=world_size,
+                                             rank=rank,
+                                             init_method=init_method)
+
+    # Device selection: CPU, explicit --gpu, or rank-derived GPU when
+    # distributed.
+    if args.gpu == -1:
+        device = 'cpu'
+    else:
+        if args.distributed == True:
+            local_rank = rank % torch.cuda.device_count()
+            torch.cuda.set_device(local_rank)
+            device = torch.device('cuda:%d' % local_rank)
+            logger.info('Use GPU %d.' % local_rank)
+        else:
+            device = torch.device('cuda:%d' % args.gpu)
+            torch.cuda.set_device(args.gpu)
+            print_rank0('Use GPU %d.' % args.gpu)
+
+    assert args.model in ['mlp', 'resnet18', 'resnet34',
+                          'vgg16', 'vgg19', 'rnn'], 'Model not supported now.'
+
+    assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+    dataset = args.dataset
+
+    # Model construction. NOTE(review): eval() on a CLI argument — the
+    # assert above constrains it to known names, but a dispatch dict would
+    # be safer.
+    if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
+        net = eval(args.model)(100)
+    elif args.model == 'rnn':
+        net = eval(args.model)(28, 10, 128, 28)
+    else:
+        net = eval(args.model)()
+
+    # NOTE(review): this assert/assignment pair duplicates the one above.
+    assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
+    dataset = args.dataset
+
+    net.to(device)
+    if args.distributed:
+        net = torch.nn.parallel.DistributedDataParallel(
+            net, device_ids=[local_rank])
+
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+ if args.opt == 'sgd':
+ print_rank0('Use SGD Optimizer.')
+ opt = optim.SGD(net.parameters(), lr=args.learning_rate)
+ elif args.opt == 'momentum':
+ print_rank0('Use Momentum Optimizer.')
+ opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9)
+ elif args.opt == 'nesterov':
+ print_rank0('Use Nesterov Momentum Optimizer.')
+ opt = optim.SGD(net.parameters(), lr=args.learning_rate,
+ momentum=0.9, nesterov=True)
+ elif args.opt == 'adagrad':
+ print_rank0('Use AdaGrad Optimizer.')
+ opt = optim.Adagrad(net.parameters(), lr=args.learning_rate)
+ else:
+ print_rank0('Use Adam Optimizer.')
+ opt = optim.Adam(lr=args.learning_rate)
+
+    criterion = nn.CrossEntropyLoss()
+
+    # data loading
+    print_rank0('Loading %s data...' % dataset)
+    if dataset == 'MNIST':
+        datasets = ht.data.mnist(onehot=False)
+        train_set_x, train_set_y = datasets[0]
+        valid_set_x, valid_set_y = datasets[1]
+        test_set_x, test_set_y = datasets[2]
+    elif dataset == 'CIFAR10':
+        train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
+            num_class=10, onehot=False)
+        # MLP consumes flat vectors, so collapse the image dimensions.
+        if args.model == "mlp":
+            train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
+            valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
+    elif dataset == 'CIFAR100':
+        train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
+            num_class=100, onehot=False)
+
+    running_time = 0
+    # training
+    # num_epochs + 1 iterations: epoch 0 acts as warm-up and is excluded
+    # from the accumulated running time (the `i != 0` guard below).
+    print_rank0("Start training loop...")
+    for i in range(args.num_epochs + 1):
+        if args.timing:
+            start = time()
+        train(epoch=i, net=net, data=train_set_x, label=train_set_y,
+              batch_size=args.batch_size, criterion=criterion, optimizer=opt)
+        if args.timing:
+            end = time()
+            print_rank0("Running time of current epoch = %fs" % (end - start))
+            if i != 0:
+                running_time += (end - start)
+        test(epoch=i, net=net, data=valid_set_x, label=valid_set_y,
+             batch_size=args.batch_size, criterion=criterion)
+
+    print_rank0("*"*50)
+    print_rank0("Running time of total %d epoch = %fs" %
+                (args.num_epochs, running_time))
diff --git a/examples/cnn/worker_conf0.json b/examples/cnn/worker_conf0.json
new file mode 100644
index 0000000..b705130
--- /dev/null
+++ b/examples/cnn/worker_conf0.json
@@ -0,0 +1,9 @@
+{
+"DMLC_ROLE":"worker",
+"WORKER_ID":"0",
+"DMLC_PS_ROOT_URI":"127.0.0.1",
+"DMLC_PS_ROOT_PORT":"13030",
+"DMLC_NUM_WORKER":"2",
+"DMLC_NUM_SERVER":"1",
+"DMLC_PS_VAN_TYPE":"p3"
+}
diff --git a/examples/cnn/worker_conf1.json b/examples/cnn/worker_conf1.json
new file mode 100644
index 0000000..831826d
--- /dev/null
+++ b/examples/cnn/worker_conf1.json
@@ -0,0 +1,9 @@
+{
+"DMLC_ROLE":"worker",
+"WORKER_ID":"1",
+"DMLC_PS_ROOT_URI":"127.0.0.1",
+"DMLC_PS_ROOT_PORT":"13030",
+"DMLC_NUM_WORKER":"2",
+"DMLC_NUM_SERVER":"1",
+"DMLC_PS_VAN_TYPE":"p3"
+}
diff --git a/examples/ctr/.gitignore b/examples/ctr/.gitignore
new file mode 100644
index 0000000..ccb0cb2
--- /dev/null
+++ b/examples/ctr/.gitignore
@@ -0,0 +1,2 @@
+datasets/
+logs/
diff --git a/examples/ctr/README.md b/examples/ctr/README.md
new file mode 100644
index 0000000..c99c5db
--- /dev/null
+++ b/examples/ctr/README.md
@@ -0,0 +1,109 @@
+# CTR Examples (with Distributed Settings)
+In this directory we provide several models for CTR tasks. We use Wide & Deep model to train on Adult and Criteo dataset, and DeepFM, DCN, DC models on Criteo dataset.
+
+## Structure
+```
+- ctr
+ - datasets/ contains sampled criteo data
+ - models/ ctr models in hetu
+ - tf_models/ ctr models in tensorflow
+ - settings/ configurations for distributed training
+ - tests/ test scripts
+ - kill.sh script to kill all python processes
+ - run_hetu.py basic trainer for hetu
+ - run_tf_local.py local trainer for tensorflow
+ - run_tf_horovod.py trainer for tensorflow in horovod setting
+ - run_tf_parallax.py trainer for tensorflow in parallax setting
+ - tf_launch_server.py launcher for server in tensorflow
+ - tf_launch_worker.py launcher for worker in tensorflow
+```
+
+## Prepare criteo data
+* We have provided a sampled version of kaggle-criteo dataset, which locates in ./datasets/criteo/ . To use the given data, please do not specify the 'all' flag and 'val' flag when running test files.
+* To download the original kaggle-criteo dataset, please specify a source in models/load_data.py and use ```python models/load_data.py``` to download the whole kaggle-criteo dataset.
+
+
+## Flags for test files
+Here we explain some of the flags you may use in test files:
+* model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult')
+* config: to specify the configuration file in settings.
+* val: whether using validation.
+* cache: whether using cache in PS/Hybrid mode.
+* bsp: whether using bsp (default asp) in PS/Hybrid mode. (In Hybrid, AllReduce can enforce dense parameters to use bsp, so there will be no stragglers.)
+* all: whether to use all criteo data.
+* bound: per embedding entry staleness in cache setting, default to be 100.
+
+
+## Usage
+If memory available, you can try to run the model locally, by running
+```bash
+# run locally
+bash tests/local_{model}_{dataset}.sh
+# run in ps setting (locally)
+bash tests/ps_{model}_{dataset}.sh
+# run in hybrid setting (locally)
+bash tests/hybrid_{model}_{dataset}.sh
+
+# run tensorflow locally
+python run_tf_local.py --model {model}_{dataset}
+# run tensorflow in horovod
+horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset}
+# run tensorflow in parallax
+python {absolute_path_to}/run_tf_parallax.py
+# run tensorflow in ps setting
+python tf_launch_server.py --config {config} --id {rank}
+python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config}
+```
+
+
+## Configuration
+We use a simple yaml file to specify the run configuration.
+
+```yaml
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+launch :
+ worker : 4
+ server : 1
+ scheduler : true
+```
+
+The four key-value pairs in "shared" are used by the PS-lite parameter server and will be added into the environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" into an available IP address in the cluster.
+
+The following "launch" section is only used in PS mode (omitted in hybrid mode). It specifies the number of workers, servers and schedulers launched locally on this machine. In hybrid mode, workers are launched by mpirun, while servers and the scheduler are launched separately via `python -m hetu.launcher` (see the examples below).
+
+
+## Examples
+### Local execution
+Run wdl with criteo locally(if the whole dataset is downloaded, you can use all data or use validate data):
+```bash
+python run_hetu.py --model wdl_criteo (--all) (--val)
+```
+
+### PS mode execution
+Run ps locally, here we can also run on multiple nodes.
+```bash
+# launch scheduler and server, -n means number of servers, --sched means using scheduler
+python -m hetu.launcher {config} -n 1 --sched
+# launch workers (or run scheduler and server together if configured in config file)
+python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10)
+```
+You can also specify the cache to be used and also the cache bound.
+
+
+### Hybrid mode execution
+You must launch a scheduler and server in one terminal:
+```bash
+python -m hetu.launcher {config} -n 1 --sched
+```
+And then launch the workers simultaneously using mpirun command:
+```bash
+mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ...
+```
+Or if in distributed nodes setting:
+```
+mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ...
+```
diff --git a/examples/ctr/kill.sh b/examples/ctr/kill.sh
new file mode 100755
index 0000000..4d99771
--- /dev/null
+++ b/examples/ctr/kill.sh
@@ -0,0 +1,3 @@
+#/bin/bash
+#pkill -f mnist_mlp_ps.py
+kill -9 $(pidof python)
diff --git a/examples/ctr/models/__init__.py b/examples/ctr/models/__init__.py
new file mode 100644
index 0000000..8c17586
--- /dev/null
+++ b/examples/ctr/models/__init__.py
@@ -0,0 +1,5 @@
+from .wdl_adult import wdl_adult
+from .dcn_criteo import dcn_criteo
+from .dc_criteo import dc_criteo
+from .wdl_criteo import wdl_criteo
+from .deepfm_criteo import dfm_criteo
diff --git a/examples/ctr/models/dc_criteo.py b/examples/ctr/models/dc_criteo.py
new file mode 100644
index 0000000..30b3522
--- /dev/null
+++ b/examples/ctr/models/dc_criteo.py
@@ -0,0 +1,63 @@
+import hetu as ht
+from hetu import init
+
+import numpy as np
+import time
+
+
+def residual_layer(x0, input_dim, hidden_dim):
+    # One residual unit: relu(W2 @ relu(W1 @ x0 + b1) + b2 + x0).
+    # Projects input_dim -> hidden_dim -> input_dim so the skip connection
+    # shapes match.
+
+    # NOTE(review): embedding_len is unused here.
+    embedding_len = input_dim
+    weight_1 = init.random_normal(
+        shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1')
+    bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1')
+    weight_2 = init.random_normal(
+        shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2')
+    bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2')
+
+    x0w = ht.matmul_op(x0, weight_1)  # (batch, hidden_dim)
+    x0w_b = x0w + ht.broadcastto_op(bias_1, x0w)
+
+    relu1 = ht.relu_op(x0w_b)
+    x1w = ht.matmul_op(relu1, weight_2)  # (batch, input_dim)
+    x1w_b = x1w + ht.broadcastto_op(bias_2, x1w)
+    residual = x1w_b + x0
+    y = ht.relu_op(residual)
+    return y
+
+
+def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3):
+    # Stack num_layers residual units, feeding each unit's output into the
+    # next.
+    for i in range(num_layers):
+        x0 = residual_layer(x0, input_dim, hidden_dim)
+    return x0
+
+
+def dc_criteo(dense_input, sparse_input, y_):
+    # Deep Crossing model for the Criteo CTR dataset: embed the 26 sparse
+    # fields, concat with the 13 dense features, pass through 5 residual
+    # layers, then a sigmoid output trained with binary cross-entropy.
+
+    feature_dimension = 33762577
+    embedding_size = 8
+    learning_rate = 0.001
+
+    Embedding = init.random_normal(
+        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding")
+    sparse_input = ht.embedding_lookup_op(Embedding, sparse_input)
+    sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))
+
+    # dc_model
+    x = ht.concat_op(sparse_input, dense_input, axis=1)
+
+    # 26 embedded fields * 8 dims + 13 dense features.
+    input_dim = 26 * 8 + 13
+    hidden_dim = input_dim
+    residual_out = build_residual_layers(
+        x, input_dim, hidden_dim, num_layers=5)
+
+    W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4")
+    y = ht.matmul_op(residual_out, W4)
+    y = ht.sigmoid_op(y)
+
+    loss = ht.binarycrossentropy_op(y, y_)
+    loss = ht.reduce_mean_op(loss, [0])
+    opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
+    train_op = opt.minimize(loss)
+
+    return loss, y, y_, train_op
diff --git a/examples/ctr/models/dcn_criteo.py b/examples/ctr/models/dcn_criteo.py
new file mode 100644
index 0000000..2222d22
--- /dev/null
+++ b/examples/ctr/models/dcn_criteo.py
@@ -0,0 +1,68 @@
+import hetu as ht
+from hetu import init
+
+import numpy as np
+import time
+
+
+def cross_layer(x0, x1):
+ # x0: input embedding feature (batch_size, 26 * embedding_size + 13)
+ # x1: the output of last layer (batch_size, 26 * embedding_size + 13)
+
+ embedding_len = 26 * 128 + 13
+ weight = init.random_normal(
+ shape=(embedding_len, 1), stddev=0.01, name='weight')
+ bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias')
+ x1w = ht.matmul_op(x1, weight) # (batch_size, 1)
+ y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0))
+ y = y + x1 + ht.broadcastto_op(bias, y)
+ return y
+
+
+def build_cross_layer(x0, num_layers=3):
+ x1 = x0
+ for i in range(num_layers):
+ x1 = cross_layer(x0, x1)
+ return x1
+
+
+def dcn_criteo(dense_input, sparse_input, y_):
+ # Deep & Cross Network for criteo CTR: a cross-network branch and a DNN
+ # branch over the same embedded input, concatenated into a linear +
+ # sigmoid head. Embedding table is placed on CPU (ctx=ht.cpu(0)).
+ # Returns (loss, prediction, label, train_op).
+ feature_dimension = 33762577
+ embedding_size = 128
+ learning_rate = 0.003
+
+ Embedding = init.random_normal(
+ [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
+ sparse_input = ht.embedding_lookup_op(
+ Embedding, sparse_input, ctx=ht.cpu(0))
+ sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))
+ x = ht.concat_op(sparse_input, dense_input, axis=1)
+ # Cross Network
+ cross_output = build_cross_layer(x, num_layers=3)
+
+ # DNN
+ flatten = x
+ W1 = init.random_normal(
+ [26*embedding_size + 13, 256], stddev=0.01, name="W1")
+ W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
+ W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
+
+ # W4 consumes the concat of the 256-dim DNN output and the
+ # (26*embedding_size + 13)-dim cross-network output.
+ W4 = init.random_normal(
+ [256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4")
+
+ fc1 = ht.matmul_op(flatten, W1)
+ relu1 = ht.relu_op(fc1)
+ fc2 = ht.matmul_op(relu1, W2)
+ relu2 = ht.relu_op(fc2)
+ y3 = ht.matmul_op(relu2, W3)
+
+ y4 = ht.concat_op(cross_output, y3, axis=1)
+ y = ht.matmul_op(y4, W4)
+ y = ht.sigmoid_op(y)
+
+ loss = ht.binarycrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
+ train_op = opt.minimize(loss)
+
+ return loss, y, y_, train_op
diff --git a/examples/ctr/models/deepfm_criteo.py b/examples/ctr/models/deepfm_criteo.py
new file mode 100644
index 0000000..d84f299
--- /dev/null
+++ b/examples/ctr/models/deepfm_criteo.py
@@ -0,0 +1,59 @@
+import hetu as ht
+from hetu import init
+
+import numpy as np
+import time
+
+
+def dfm_criteo(dense_input, sparse_input, y_):
+ # DeepFM for criteo CTR: first-order FM term + second-order FM term
+ # + a DNN over the embeddings; the three scalars are summed and passed
+ # through a sigmoid. Returns (loss, prediction, label, train_op).
+ feature_dimension = 33762577
+ embedding_size = 128
+ learning_rate = 0.01
+
+ # FM
+ Embedding1 = init.random_normal(
+ [feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0))
+ FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
+ sparse_1dim_input = ht.embedding_lookup_op(
+ Embedding1, sparse_input, ctx=ht.cpu(0))
+ fm_dense_part = ht.matmul_op(dense_input, FM_W)
+ fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1)
+ # fst order output
+ y1 = fm_dense_part + fm_sparse_part
+
+ Embedding2 = init.random_normal(
+ [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
+ sparse_2dim_input = ht.embedding_lookup_op(
+ Embedding2, sparse_input, ctx=ht.cpu(0))
+ # FM second-order interaction via the identity
+ #   sum_{i<j} <v_i, v_j> = 0.5 * ((sum v)^2 - sum(v^2))
+ sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1)
+ sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum)
+
+ sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input)
+ sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1)
+ sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
+ sparse_2dim_half = sparse_2dim * 0.5
+ # snd order output
+ y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)
+
+ # DNN
+ flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size))
+ W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1")
+ W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
+ W3 = init.random_normal([256, 1], stddev=0.01, name="W3")
+
+ fc1 = ht.matmul_op(flatten, W1)
+ relu1 = ht.relu_op(fc1)
+ fc2 = ht.matmul_op(relu1, W2)
+ relu2 = ht.relu_op(fc2)
+ y3 = ht.matmul_op(relu2, W3)
+
+ # sum of the FM first-order, FM second-order and DNN outputs
+ y4 = y1 + y2
+ y = y4 + y3
+ y = ht.sigmoid_op(y)
+
+ loss = ht.binarycrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
+ train_op = opt.minimize(loss)
+
+ return loss, y, y_, train_op
diff --git a/examples/ctr/models/load_data.py b/examples/ctr/models/load_data.py
new file mode 100644
index 0000000..6122342
--- /dev/null
+++ b/examples/ctr/models/load_data.py
@@ -0,0 +1,320 @@
+import os
+import numpy as np
+
+
+###########################################################################
+# criteo
+###########################################################################
+
+def download_criteo(path):
+ # Download the criteo display-advertising dataset into `path`, extract
+ # it, save a labeled train.csv, preprocess dense/sparse features, and
+ # persist a 90/10 train/test split as six .npy files.
+ import tarfile
+ import pandas as pd
+ from six.moves import urllib
+ if not os.path.exists(path):
+ os.makedirs(path)
+ assert os.path.isdir(path), 'Please provide a directory path.'
+ # this source may be invalid, please use other valid sources.
+ origin = (
+ 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'
+ )
+ print('Downloading data from %s' % origin)
+ dataset = os.path.join(path, 'criteo.tar.gz')
+ urllib.request.urlretrieve(origin, dataset)
+ print("Extracting criteo zip...")
+ # NOTE(review): extractall on a downloaded archive without a member
+ # filter is vulnerable to path traversal if the source is untrusted.
+ with tarfile.open(dataset) as f:
+ f.extractall(path=path)
+ print("Create local files...")
+
+ # save csv filed
+ df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None)
+ df.columns = ['label'] + ["I" +
+ str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)]
+ # index=0 is treated as index=False (do not write the row index)
+ df.to_csv(os.path.join(path, "train.csv"), index=0)
+ print('Csv file saved.')
+
+ # save numpy arrays
+ target_path = [os.path.join(path, filename) for filename in [
+ 'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy',
+ 'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']]
+ dense_feats = [col for col in df.columns if col.startswith('I')]
+ sparse_feats = [col for col in df.columns if col.startswith('C')]
+ labels = df['label']
+ dense_feats = process_dense_feats(df, dense_feats)
+ sparse_feats = process_sparse_feats(df, sparse_feats)
+ num_data = dense_feats.shape[0]
+ # shuffle once, then hold out the last tenth as the test split
+ perm = np.random.permutation(num_data)
+ # split data in 2 parts
+ test_num = num_data // 10
+ processed_data = [
+ dense_feats[perm[:-test_num]], # train dense
+ sparse_feats[perm[:-test_num]], # train sparse
+ labels[perm[:-test_num]], # train labels
+ dense_feats[perm[-test_num:]], # validate dense
+ sparse_feats[perm[-test_num:]], # validate sparse
+ labels[perm[-test_num:]], # validate labels
+ ]
+ print('Array shapes:')
+ for i in range(len(processed_data)):
+ print(os.path.split(target_path[i])
+ [-1].split('.')[0], processed_data[i].shape)
+ np.save(target_path[i], processed_data[i])
+ print('Numpy arrays saved.')
+
+
+def process_dense_feats(data, feats):
+ d = data.copy()
+ d = d[feats].fillna(0.0)
+ for f in feats:
+ d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
+ return d
+
+
+def process_sparse_feats(data, feats):
+ # Label-encode each sparse column, then offset every column so all
+ # fields share one global, non-overlapping id space (suitable for a
+ # single embedding table).
+ from sklearn.preprocessing import LabelEncoder
+ # process to embeddings.
+ d = data.copy()
+ d = d[feats].fillna("-1")
+ for f in feats:
+ label_encoder = LabelEncoder()
+ d[f] = label_encoder.fit_transform(d[f])
+ # cumulative offset: column f's ids start after all previous columns' ids
+ feature_cnt = 0
+ for f in feats:
+ d[f] += feature_cnt
+ feature_cnt += d[f].nunique()
+ return d
+
+
+def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True):
+ # Load and preprocess only the first `nrows` rows of criteo's train.csv
+ # (downloading the dataset first if missing). With return_val=True the
+ # last tenth of the rows is returned as a validation split.
+ import pandas as pd
+ csv_path = os.path.join(path, "train.csv")
+ if not os.path.exists(csv_path):
+ download_criteo(path)
+ df = pd.read_csv(csv_path, nrows=nrows, header=0)
+ # dense columns are named I1..I13, sparse columns C14..C39
+ dense_feats = [col for col in df.columns if col.startswith('I')]
+ sparse_feats = [col for col in df.columns if col.startswith('C')]
+ labels = np.array(df['label']).reshape(-1, 1)
+ dense_feats = np.array(process_dense_feats(df, dense_feats))
+ sparse_feats = np.array(process_sparse_feats(
+ df, sparse_feats)).astype(np.int32)
+ if return_val:
+ test_num = nrows // 10
+ train_dense = dense_feats[:-test_num]
+ train_sparse = sparse_feats[:-test_num]
+ train_label = labels[:-test_num]
+ validate_dense = dense_feats[-test_num:]
+ validate_sparse = sparse_feats[-test_num:]
+ validate_label = labels[-test_num:]
+ return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label)
+ else:
+ return dense_feats, sparse_feats, labels
+
+
+def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')):
+ # all data should be available! no checking.
+ processed_data = [np.load(os.path.join(path, filename))
+ for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']]
+ return tuple(processed_data)
+
+
+def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True):
+ # Load the full preprocessed criteo split from .npy files, downloading
+ # and preprocessing first if any file is missing. With return_val=True
+ # returns ((train, test) pairs) for dense, sparse and labels; otherwise
+ # only the train arrays.
+ file_paths = [os.path.join(path, filename) for filename in [
+ 'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy',
+ 'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']]
+ if not all([os.path.exists(p) for p in file_paths]):
+ download_criteo(path)
+ files = [np.load(filename) for filename in file_paths]
+ if return_val:
+ return (files[0], files[1]), (files[2], files[3]), (files[4], files[5])
+ else:
+ return files[0], files[2], files[4]
+
+
+###########################################################################
+# adult
+###########################################################################
+
+def maybe_download(train_data, test_data):
+ import pandas as pd
+ """if adult data "train.csv" and "test.csv" are not in your directory,
+ download them.
+ """
+
+ COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
+ "marital_status", "occupation", "relationship", "race", "gender",
+ "capital_gain", "capital_loss", "hours_per_week", "native_country",
+ "income_bracket"]
+
+ if not os.path.exists(train_data):
+ print("downloading training data...")
+ df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
+ names=COLUMNS, skipinitialspace=True)
+ else:
+ df_train = pd.read_csv("train.csv")
+
+ if not os.path.exists(test_data):
+ print("downloading testing data...")
+ df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
+ names=COLUMNS, skipinitialspace=True, skiprows=1)
+ else:
+ df_test = pd.read_csv("test.csv")
+
+ return df_train, df_test
+
+
+def cross_columns(x_cols):
+ """simple helper to build the crossed columns in a pandas dataframe
+ """
+ crossed_columns = dict()
+ colnames = ['_'.join(x_c) for x_c in x_cols]
+ for cname, x_c in zip(colnames, x_cols):
+ crossed_columns[cname] = x_c
+ return crossed_columns
+
+
+def val2idx(df, cols):
+ """helper to index categorical columns before embeddings.
+ """
+ val_types = dict()
+ for c in cols:
+ val_types[c] = df[c].unique()
+
+ val_to_idx = dict()
+ for k, v in val_types.items():
+ val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
+
+ for k, v in val_to_idx.items():
+ df[k] = df[k].apply(lambda x: v[x])
+
+ unique_vals = dict()
+ for c in cols:
+ unique_vals[c] = df[c].nunique()
+
+ return df, unique_vals
+
+
+def onehot(x):
+ # Dense one-hot encoding: (n, 1) integer labels -> (n, num_classes).
+ from sklearn.preprocessing import OneHotEncoder
+ return np.array(OneHotEncoder().fit_transform(x).todense())
+
+
+def wide(df_train, df_test, wide_cols, x_cols, target):
+ # Build the 'wide' design matrix: the raw wide columns plus crossed
+ # columns, one-hot encoded with pd.get_dummies. Returns
+ # (X_train, y_train, X_test, y_test) with labels shaped (n, 1).
+ # NOTE: mutates df_train/df_test by adding IS_TRAIN and crossed columns.
+ import pandas as pd
+ print('Processing wide data')
+ df_train['IS_TRAIN'] = 1
+ df_test['IS_TRAIN'] = 0
+ df_wide = pd.concat([df_train, df_test])
+
+ crossed_columns_d = cross_columns(x_cols)
+ categorical_columns = list(
+ df_wide.select_dtypes(include=['object']).columns)
+
+ wide_cols += list(crossed_columns_d.keys())
+
+ # materialize each crossed column as the '-'-joined string of its parts
+ for k, v in crossed_columns_d.items():
+ df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)
+
+ df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]
+
+ dummy_cols = [
+ c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
+ df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols])
+
+ # split back into train/test using the IS_TRAIN marker
+ train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
+ test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
+ assert all(train.columns == test.columns)
+
+ cols = [c for c in train.columns if c != target]
+ X_train = train[cols].values
+ y_train = train[target].values.reshape(-1, 1)
+ X_test = test[cols].values
+ y_test = test[target].values.reshape(-1, 1)
+ return X_train, y_train, X_test, y_test
+
+
+def load_adult_data(return_val=True):
+ # Load the UCI adult dataset and produce wide (one-hot) and deep
+ # (label-indexed + scaled continuous) feature matrices plus one-hot
+ # labels; returns train arrays, and test arrays when return_val=True.
+ import pandas as pd
+ df_train, df_test = maybe_download("train.csv", "test.csv")
+
+ # binary label: 1 iff income bracket contains ">50K"
+ df_train['income_label'] = (
+ df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
+ df_test['income_label'] = (
+ df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
+
+ # bucketize age into 3 groups: (0,25], (25,65], (65,90]
+ age_groups = [0, 25, 65, 90]
+ age_labels = range(len(age_groups) - 1)
+ df_train['age_group'] = pd.cut(
+ df_train['age'], age_groups, labels=age_labels)
+ df_test['age_group'] = pd.cut(
+ df_test['age'], age_groups, labels=age_labels)
+
+ # columns for wide model
+ wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
+ 'relationship', 'race', 'gender', 'native_country', 'age_group']
+ x_cols = (['education', 'occupation'], ['native_country', 'occupation'])
+
+ # columns for deep model
+ embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
+ 'relationship', 'race', 'gender', 'native_country']
+ cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
+
+ target = 'income_label'
+
+ x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
+ df_train, df_test, wide_cols, x_cols, target)
+ x_train_wide = np.array(x_train_wide).astype(np.float32)
+ x_test_wide = np.array(x_test_wide).astype(np.float32)
+
+ print('Processing deep data')
+ df_train['IS_TRAIN'] = 1
+ df_test['IS_TRAIN'] = 0
+ df_deep = pd.concat([df_train, df_test])
+
+ deep_cols = embedding_cols + cont_cols
+ df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
+ from sklearn.preprocessing import StandardScaler
+ scaler = StandardScaler()
+ # NOTE(review): the scaler is fit AND transformed on df_train only, but
+ # the result is assigned into df_deep (train + test rows). This relies
+ # on pandas index alignment over a concat with duplicate indices --
+ # verify test rows receive scaled values rather than NaN/misaligned data.
+ df_deep[cont_cols] = pd.DataFrame(scaler.fit_transform(df_train[cont_cols]),
+ columns=cont_cols)
+ df_deep, unique_vals = val2idx(df_deep, embedding_cols)
+
+ train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
+ test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
+
+ x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
+ y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
+ x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
+ y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)
+
+ # transpose from (num_cols, n) to (n, num_cols)
+ x_train_deep = np.transpose(x_train_deep)
+ x_test_deep = np.transpose(x_test_deep)
+ y_train = onehot(y_train)
+ y_test = onehot(y_test)
+
+ if return_val:
+ return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
+ else:
+ return x_train_deep, x_train_wide, y_train
+
+
+###########################################################################
+# avazu
+###########################################################################
+
+def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')):
+ # Preprocess the avazu train.csv (columns after id/label are sparse
+ # fields) into a globally-offset id matrix saved as sparse.npy.
+ import pandas as pd
+ # please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data
+ train_file = os.path.join(path, 'train.csv')
+ # test_file = os.path.join(path, 'test.csv') # useless, no labels
+
+ df_train = pd.read_csv(train_file)
+ sparse_feats = process_sparse_feats(df_train, df_train.columns[2:])
+ # the embedding num for each feature:
+ # [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60]
+ # sum: 9449445
+
+ np.save(os.path.join(path, 'sparse.npy'), sparse_feats)
+
+
+if __name__ == '__main__':
+ download_criteo(os.path.join(os.path.split(
+ os.path.abspath(__file__)), '../datasets/criteo'))
diff --git a/examples/ctr/models/wdl_adult.py b/examples/ctr/models/wdl_adult.py
new file mode 100644
index 0000000..1ce77e0
--- /dev/null
+++ b/examples/ctr/models/wdl_adult.py
@@ -0,0 +1,56 @@
+import hetu as ht
+from hetu import init
+
+
+def wdl_adult(X_deep, X_wide, y_):
+ # Wide & Deep model for the adult dataset: 8 embedded categorical
+ # fields + 4 continuous fields feed a 2-layer MLP (deep); its output is
+ # concatenated with the wide one-hot features for a softmax head.
+ # Returns (loss, prediction, label, train_op).
+ lr = 5 / 128
+ dim_wide = 809
+ dim_deep = 68
+
+ W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
+ W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
+ b1 = init.random_normal([50], stddev=0.1, name="b1")
+ W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
+ b2 = init.random_normal([20], stddev=0.1, name="b2")
+
+ # deep
+ Embedding = []
+ X_deep_input = None
+
+ # first 8 deep inputs are categorical: embed each to 8 dims and concat
+ for i in range(8):
+ Embedding_name = "Embedding_deep_" + str(i)
+ Embedding.append(init.random_normal(
+ [50, 8], stddev=0.1, name=Embedding_name))
+ now = ht.embedding_lookup_op(Embedding[i], X_deep[i])
+ now = ht.array_reshape_op(now, (-1, 8))
+ if X_deep_input is None:
+ X_deep_input = now
+ else:
+ X_deep_input = ht.concat_op(X_deep_input, now, 1)
+
+ # remaining 4 deep inputs are continuous scalars, appended as-is
+ for i in range(4):
+ now = ht.array_reshape_op(X_deep[i + 8], (-1, 1))
+ X_deep_input = ht.concat_op(X_deep_input, now, 1)
+
+ mat1 = ht.matmul_op(X_deep_input, W1)
+ add1 = mat1 + ht.broadcastto_op(b1, mat1)
+ relu1 = ht.relu_op(add1)
+ # dropout currently disabled; these are pass-through placeholders
+ dropout1 = relu1
+ mat2 = ht.matmul_op(dropout1, W2)
+ add2 = mat2 + ht.broadcastto_op(b2, mat2)
+ relu2 = ht.relu_op(add2)
+ dropout2 = relu2
+ dmodel = dropout2
+
+ # wide
+ wmodel = ht.concat_op(X_wide, dmodel, 1)
+ wmodel = ht.matmul_op(wmodel, W)
+
+ prediction = wmodel
+ loss = ht.softmaxcrossentropy_op(prediction, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+
+ opt = ht.optim.SGDOptimizer(learning_rate=lr)
+ train_op = opt.minimize(loss)
+
+ return loss, prediction, y_, train_op
diff --git a/examples/ctr/models/wdl_criteo.py b/examples/ctr/models/wdl_criteo.py
new file mode 100644
index 0000000..416ae24
--- /dev/null
+++ b/examples/ctr/models/wdl_criteo.py
@@ -0,0 +1,42 @@
+import hetu as ht
+from hetu import init
+
+import numpy as np
+import time
+
+
+def wdl_criteo(dense_input, sparse_input, y_):
+ # Wide & Deep for criteo CTR: a DNN over the 13 dense features, whose
+ # 256-dim output is concatenated with the flattened sparse embeddings
+ # (the 'wide' part) before the final linear + sigmoid head.
+ # Returns (loss, prediction, label, train_op).
+ feature_dimension = 33762577
+ embedding_size = 128
+ learning_rate = 0.01
+ Embedding = init.random_normal(
+ [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
+ sparse_input = ht.embedding_lookup_op(
+ Embedding, sparse_input, ctx=ht.cpu(0))
+ sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))
+
+ # DNN
+ flatten = dense_input
+ W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
+ W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
+ W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
+
+ # W4 consumes the concat of the 256-dim DNN output and the embeddings
+ W4 = init.random_normal(
+ [256 + 26*embedding_size, 1], stddev=0.01, name="W4")
+
+ fc1 = ht.matmul_op(flatten, W1)
+ relu1 = ht.relu_op(fc1)
+ fc2 = ht.matmul_op(relu1, W2)
+ relu2 = ht.relu_op(fc2)
+ y3 = ht.matmul_op(relu2, W3)
+
+ y4 = ht.concat_op(sparse_input, y3, axis=1)
+ y = ht.matmul_op(y4, W4)
+ y = ht.sigmoid_op(y)
+
+ loss = ht.binarycrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
+ train_op = opt.minimize(loss)
+
+ return loss, y, y_, train_op
diff --git a/examples/ctr/run_hetu.py b/examples/ctr/run_hetu.py
new file mode 100644
index 0000000..5745af3
--- /dev/null
+++ b/examples/ctr/run_hetu.py
@@ -0,0 +1,230 @@
+import hetu as ht
+from hetu.launcher import launch
+
+import os
+import os.path as osp
+import numpy as np
+import yaml
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+
+
+def worker(args):
+ # One training process: sets up (optional) distributed rank/device,
+ # builds dataloaders for the chosen dataset, constructs the model graph
+ # via args.model, and runs the train/validate epoch loop.
+ # Run `iterations` training steps; returns mean loss/acc (and AUC when
+ # auc_enabled).
+ def train(iterations, auc_enabled=True, tqdm_enabled=False):
+ localiter = tqdm(range(iterations)
+ ) if tqdm_enabled else range(iterations)
+ train_loss = []
+ train_acc = []
+ if auc_enabled:
+ train_auc = []
+ for it in localiter:
+ loss_val, predict_y, y_val, _ = executor.run(
+ 'train', convert_to_numpy_ret_vals=True)
+ if y_val.shape[1] == 1: # for criteo case
+ acc_val = np.equal(
+ y_val,
+ predict_y > 0.5).astype(np.float32)
+ else:
+ acc_val = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ if auc_enabled:
+ train_auc.append(metrics.roc_auc_score(y_val, predict_y))
+ if auc_enabled:
+ return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
+ else:
+ return np.mean(train_loss), np.mean(train_acc)
+
+ # Run `iterations` validation steps; returns mean loss/acc/AUC.
+ def validate(iterations, tqdm_enabled=False):
+ localiter = tqdm(range(iterations)
+ ) if tqdm_enabled else range(iterations)
+ test_loss = []
+ test_acc = []
+ test_auc = []
+ for it in localiter:
+ loss_val, test_y_predicted, y_test_val = executor.run(
+ 'validate', convert_to_numpy_ret_vals=True)
+ if y_test_val.shape[1] == 1: # for criteo case
+ correct_prediction = np.equal(
+ y_test_val,
+ test_y_predicted > 0.5).astype(np.float32)
+ else:
+ correct_prediction = np.equal(
+ np.argmax(y_test_val, 1),
+ np.argmax(test_y_predicted, 1)).astype(np.float32)
+ test_loss.append(loss_val[0])
+ test_acc.append(correct_prediction)
+ test_auc.append(metrics.roc_auc_score(
+ y_test_val, test_y_predicted))
+ return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)
+
+ # Slice this worker's shard of the data (last rank takes the remainder);
+ # identity when running locally (args.comm is None).
+ def get_current_shard(data):
+ if args.comm is not None:
+ part_size = data.shape[0] // nrank
+ start = part_size * rank
+ end = start + part_size if rank != nrank - 1 else data.shape[0]
+ return data[start:end]
+ else:
+ return data
+
+ batch_size = 128
+ dataset = args.dataset
+ model = args.model
+ device_id = 0
+
+ # distributed setup: derive rank/nrank and pick a GPU per worker
+ if args.comm == 'PS':
+ rank = ht.get_worker_communicate().rank()
+ nrank = int(os.environ['DMLC_NUM_WORKER'])
+ device_id = rank % 8
+ elif args.comm == 'Hybrid':
+ comm = ht.wrapped_mpi_nccl_init()
+ device_id = comm.dev_id
+ rank = comm.rank
+ nrank = int(os.environ['DMLC_NUM_WORKER'])
+
+ if dataset == 'criteo':
+ # define models for criteo
+ if args.all:
+ from models.load_data import process_all_criteo_data
+ dense, sparse, labels = process_all_criteo_data(
+ return_val=args.val)
+ elif args.val:
+ from models.load_data import process_head_criteo_data
+ dense, sparse, labels = process_head_criteo_data(return_val=True)
+ else:
+ from models.load_data import process_sampled_criteo_data
+ dense, sparse, labels = process_sampled_criteo_data()
+ # tuple => loaders carry a (train, validate) split
+ if isinstance(dense, tuple):
+ dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [
+ get_current_shard(dense[1]), batch_size, 'validate']])
+ sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [
+ get_current_shard(sparse[1]), batch_size, 'validate']])
+ y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [
+ get_current_shard(labels[1]), batch_size, 'validate']])
+ else:
+ dense_input = ht.dataloader_op(
+ [[get_current_shard(dense), batch_size, 'train']])
+ sparse_input = ht.dataloader_op(
+ [[get_current_shard(sparse), batch_size, 'train']])
+ y_ = ht.dataloader_op(
+ [[get_current_shard(labels), batch_size, 'train']])
+ elif dataset == 'adult':
+ from models.load_data import load_adult_data
+ x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()
+ # adult deep features: one dataloader per column (12 columns)
+ dense_input = [
+ ht.dataloader_op([
+ [get_current_shard(x_train_deep[:, i]), batch_size, 'train'],
+ [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'],
+ ]) for i in range(12)
+ ]
+ sparse_input = ht.dataloader_op([
+ [get_current_shard(x_train_wide), batch_size, 'train'],
+ [get_current_shard(x_test_wide), batch_size, 'validate'],
+ ])
+ y_ = ht.dataloader_op([
+ [get_current_shard(y_train), batch_size, 'train'],
+ [get_current_shard(y_test), batch_size, 'validate'],
+ ])
+ else:
+ raise NotImplementedError
+ print("Data loaded.")
+
+ loss, prediction, y_, train_op = model(dense_input, sparse_input, y_)
+
+ eval_nodes = {'train': [loss, prediction, y_, train_op]}
+ if args.val:
+ print('Validation enabled...')
+ eval_nodes['validate'] = [loss, prediction, y_]
+ executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs')
+ executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id),
+ comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path)
+
+ # full-data run: each 'epoch' covers one tenth of the data and results
+ # are appended to a per-worker log file
+ if args.all and dataset == 'criteo':
+ print('Processing all data...')
+ file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[
+ args.comm], args.raw_model)
+ file_path += '%d.log' % rank if args.comm else '.log'
+ file_path = osp.join(osp.dirname(
+ osp.abspath(__file__)), 'logs', file_path)
+ log_file = open(file_path, 'w')
+ total_epoch = args.nepoch if args.nepoch > 0 else 11
+ for ep in range(total_epoch):
+ print("ep: %d" % ep)
+ ep_st = time.time()
+ train_loss, train_acc, train_auc = train(executor.get_batch_num(
+ 'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True)
+ ep_en = time.time()
+ if args.val:
+ val_loss, val_acc, val_auc = validate(
+ executor.get_batch_num('validate'))
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
+ % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st)
+ else:
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
+ % (train_loss, train_acc, train_auc, ep_en - ep_st)
+ print(printstr)
+ log_file.write(printstr + '\n')
+ log_file.flush()
+ else:
+ total_epoch = args.nepoch if args.nepoch > 0 else 50
+ # NOTE(review): 'start' is only assigned when ep == 5; if nepoch is
+ # between 1 and 5 the final print below raises NameError -- confirm
+ # intended minimum epoch count.
+ for ep in range(total_epoch):
+ if ep == 5:
+ start = time.time()
+ print("epoch %d" % ep)
+ ep_st = time.time()
+ train_loss, train_acc = train(
+ executor.get_batch_num('train'), auc_enabled=False)
+ ep_en = time.time()
+ if args.val:
+ val_loss, val_acc, val_auc = validate(
+ executor.get_batch_num('validate'))
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f"
+ % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc))
+ else:
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (train_loss, train_acc, ep_en - ep_st))
+ print('all time:', time.time() - start)
+
+
+if __name__ == '__main__':
+ # CLI entry point: resolve the model factory by name from `models`,
+ # then dispatch to worker() directly (local / Hybrid) or via the PS
+ # launcher.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True,
+ help="model to be tested")
+ parser.add_argument("--val", action="store_true",
+ help="whether to use validation")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ parser.add_argument("--comm", default=None,
+ help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
+ parser.add_argument("--bsp", action="store_true",
+ help="whether to use bsp instead of asp")
+ parser.add_argument("--cache", default=None, help="cache policy")
+ parser.add_argument("--bound", default=100, help="cache bound")
+ parser.add_argument("--config", type=str, default=osp.join(osp.dirname(
+ osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps")
+ parser.add_argument("--nepoch", type=int, default=-1,
+ help="num of epochs, each train 1/10 data")
+ args = parser.parse_args()
+ import models
+ print('Model:', args.model)
+ # NOTE(review): eval() on a CLI-supplied name; acceptable for a research
+ # script but trusts the --model argument (getattr would be safer).
+ model = eval('models.' + args.model)
+ # dataset is encoded as the model-name suffix, e.g. 'wdl_criteo'
+ args.dataset = args.model.split('_')[-1]
+ args.raw_model = args.model
+ args.model = model
+ if args.comm is None:
+ worker(args)
+ elif args.comm == 'Hybrid':
+ # export the shared PS settings as environment variables
+ settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader)
+ value = settings['shared']
+ os.environ['DMLC_ROLE'] = 'worker'
+ for k, v in value.items():
+ os.environ[k] = str(v)
+ worker(args)
+ elif args.comm == 'PS':
+ launch(worker, args)
+ else:
+ raise NotImplementedError
diff --git a/examples/ctr/run_tf_horovod.py b/examples/ctr/run_tf_horovod.py
new file mode 100644
index 0000000..dc590cc
--- /dev/null
+++ b/examples/ctr/run_tf_horovod.py
@@ -0,0 +1,174 @@
+import os
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+import horovod.tensorflow as hvd
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
+# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
+# if using multi nodes setting in conda, need to modify /etc/bash.bashrc
+# we can also use mpirun (default gloo):
+# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
+# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
+
+
+def train_criteo(model, args):
+ # Horovod training loop for a criteo CTR model built by `model`:
+ # initializes Horovod, shards the data per rank, wraps the optimizer in
+ # hvd.DistributedOptimizer, and runs the epoch loop.
+ hvd.init()
+
+ # This rank's slice of the data; the last rank takes the remainder.
+ def get_current_shard(data):
+ part_size = data.shape[0] // hvd.size()
+ start = part_size * hvd.rank()
+ end = start + part_size if hvd.rank() != hvd.size() - \
+ 1 else data.shape[0]
+ return data[start:end]
+
+ if args.all:
+ from models.load_data import process_all_criteo_data
+ dense, sparse, all_labels = process_all_criteo_data()
+ dense_feature = get_current_shard(dense[0])
+ sparse_feature = get_current_shard(sparse[0])
+ labels = get_current_shard(all_labels[0])
+ # NOTE(review): val_* shards are prepared but never used below.
+ val_dense = get_current_shard(dense[1])
+ val_sparse = get_current_shard(sparse[1])
+ val_labels = get_current_shard(all_labels[1])
+ else:
+ from models.load_data import process_sampled_criteo_data
+ dense_feature, sparse_feature, labels = process_sampled_criteo_data()
+ dense_feature = get_current_shard(dense_feature)
+ sparse_feature = get_current_shard(sparse_feature)
+ labels = get_current_shard(labels)
+
+ batch_size = 128
+ dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
+ sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
+ # NOTE(review): duplicated assignment 'y_ = y_ =' is a harmless typo.
+ y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
+
+ loss, y, opt = model(dense_input, sparse_input, y_)
+ global_step = tf.train.get_or_create_global_step()
+ # here in DistributedOptimizer by default all tensor are reduced on GPU
+ # can use device_sparse=xxx, device_dense=xxx to modify
+ # if using device_sparse='/cpu:0', the performance degrades
+ train_op = hvd.DistributedOptimizer(
+ opt).minimize(loss, global_step=global_step)
+
+ gpu_options = tf.compat.v1.GPUOptions(
+ allow_growth=True, visible_device_list=str(hvd.local_rank()))
+ # here horovod default use gpu to initialize, which will cause OOM
+ hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
+ sess = tf.compat.v1.train.MonitoredTrainingSession(
+ hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+
+ # pre-allocated feed buffers, refilled in-place each step
+ my_feed_dict = {
+ dense_input: np.empty(shape=(batch_size, 13)),
+ sparse_input: np.empty(shape=(batch_size, 26)),
+ y_: np.empty(shape=(batch_size, 1)),
+ }
+
+ if args.all:
+ raw_log_file = './logs/tf_hvd_%s_%d.log' % (
+ args.model, hvd.local_rank())
+ print('Processing all data, log to', raw_log_file)
+ log_file = open(raw_log_file, 'w')
+ iterations = dense_feature.shape[0] // batch_size
+ total_epoch = 400
+ start_index = 0
+ # each 'epoch' covers one tenth of the data (remainder every 10th)
+ for ep in range(total_epoch):
+ print("epoch %d" % ep)
+ st_time = time.time()
+ train_loss, train_acc, train_auc = [], [], []
+ for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+ start_index += batch_size
+ if start_index + batch_size > dense_feature.shape[0]:
+ start_index = 0
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ train_auc.append(metrics.roc_auc_score(true_val, pred_val))
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ tra_auc = np.mean(train_auc)
+ en_time = time.time()
+ train_time = en_time - st_time
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, tra_auc, train_time)
+ print(printstr)
+ log_file.write(printstr + '\n')
+ log_file.flush()
+
+ else:
+ iterations = dense_feature.shape[0] // batch_size
+
+ epoch = 50
+ for ep in range(epoch):
+ print('epoch', ep)
+ # timing starts at epoch 5 to exclude warm-up epochs
+ if ep == 5:
+ start = time.time()
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ for idx in range(iterations):
+ start_index = idx * batch_size
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ if pred_val.shape[1] == 1: # for criteo case
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ else:
+ acc_val = np.equal(
+ np.argmax(pred_val, 1),
+ np.argmax(true_val, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print('all time:', (time.time() - start))
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True,
+ help="model to be tested")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+ raw_model = args.model
+ import tf_models
+ model = eval('tf_models.' + raw_model)
+ dataset = raw_model.split('_')[-1]
+ print('Model:', raw_model)
+ train_criteo(model, args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/ctr/run_tf_local.py b/examples/ctr/run_tf_local.py
new file mode 100644
index 0000000..1b0bc6d
--- /dev/null
+++ b/examples/ctr/run_tf_local.py
@@ -0,0 +1,202 @@
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+
+
+def train_criteo(model, args):
+ if args.all:
+ from models.load_data import process_all_criteo_data
+ dense, sparse, all_labels = process_all_criteo_data()
+ dense_feature, val_dense = dense
+ sparse_feature, val_sparse = sparse
+ labels, val_labels = all_labels
+ else:
+ from models.load_data import process_sampled_criteo_data
+ dense_feature, sparse_feature, labels = process_sampled_criteo_data()
+
+ batch_size = 128
+ dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
+ sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
+ y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
+
+ loss, y, opt = model(dense_input, sparse_input, y_)
+ train_op = opt.minimize(loss)
+
+ init = tf.compat.v1.global_variables_initializer()
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ sess = tf.compat.v1.Session(
+ config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+ sess.run(init)
+
+ my_feed_dict = {
+ dense_input: np.empty(shape=(batch_size, 13)),
+ sparse_input: np.empty(shape=(batch_size, 26)),
+ y_: np.empty(shape=(batch_size, 1)),
+ }
+
+ if args.all:
+ raw_log_file = './logs/tf_local_%s.log' % (args.model)
+ print('Processing all data, log to', raw_log_file)
+ log_file = open(raw_log_file, 'w')
+ iterations = dense_feature.shape[0] // batch_size
+ total_epoch = 11
+ start_index = 0
+ for ep in range(total_epoch):
+ print("epoch %d" % ep)
+ st_time = time.time()
+ train_loss, train_acc, train_auc = [], [], []
+ for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+ start_index += batch_size
+ if start_index + batch_size > dense_feature.shape[0]:
+ start_index = 0
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ train_auc.append(metrics.roc_auc_score(true_val, pred_val))
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ tra_auc = np.mean(train_auc)
+ en_time = time.time()
+ train_time = en_time - st_time
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, tra_auc, train_time)
+ print(printstr)
+ log_file.write(printstr + '\n')
+ log_file.flush()
+
+ else:
+ iteration = dense_feature.shape[0] // batch_size
+
+ epoch = 50
+ for ep in range(epoch):
+ print('epoch', ep)
+ if ep == 5:
+ start = time.time()
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ for idx in range(iteration):
+ start_index = idx * batch_size
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ if pred_val.shape[1] == 1: # for criteo case
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ else:
+ acc_val = np.equal(
+ np.argmax(pred_val, 1),
+ np.argmax(true_val, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print('all time:', (time.time() - start))
+
+
+def train_adult(model):
+ batch_size = 128
+ total_epoch = 50
+ dim_wide = 809
+
+ X_deep = []
+ for i in range(8):
+ X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1]))
+ for i in range(4):
+ X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1]))
+ X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide])
+ y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2])
+ loss, y, train_op = model(X_deep, X_wide, y_)
+
+ init = tf.compat.v1.global_variables_initializer()
+
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+
+ sess.run(init)
+
+ from models.load_data import load_adult_data
+ x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False)
+
+ iterations = x_train_deep.shape[0] // batch_size
+ for ep in range(total_epoch):
+ print('epoch', ep)
+ if ep == 5:
+ start = time.time()
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ pre_index = 0
+
+ for it in range(iterations):
+ batch_x_deep = x_train_deep[pre_index:pre_index + batch_size]
+ batch_x_wide = x_train_wide[pre_index:pre_index + batch_size]
+ batch_y = y_train[pre_index:pre_index + batch_size]
+ pre_index += batch_size
+
+ my_feed_dict = dict()
+ for i in range(12):
+ my_feed_dict[X_deep[i]] = np.array(
+ batch_x_deep[:, i]).reshape(-1, 1)
+
+ my_feed_dict[X_wide] = np.array(batch_x_wide)
+ my_feed_dict[y_] = batch_y
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ acc_val = np.equal(
+ np.argmax(loss_val[1], 1),
+ np.argmax(loss_val[2], 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print('all time:', (time.time() - start))
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True,
+ help="model to be tested")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+ raw_model = args.model
+ import tf_models
+ model = getattr(tf_models, raw_model)
+ dataset = raw_model.split('_')[-1]
+ print('Model:', raw_model)
+
+ if dataset == 'criteo':
+ train_criteo(model, args)
+ elif dataset == 'adult':
+ train_adult(model)
+ else:
+ raise NotImplementedError
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/ctr/run_tf_parallax.py b/examples/ctr/run_tf_parallax.py
new file mode 100644
index 0000000..9642775
--- /dev/null
+++ b/examples/ctr/run_tf_parallax.py
@@ -0,0 +1,211 @@
+import os
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+
+from autodist import AutoDist
+from autodist.resource_spec import ResourceSpec
+from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax
+from autodist.strategy.base import Strategy
+from autodist.kernel.common.utils import get_op_name
+from tensorflow.python.framework import ops
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+# Please DO NOT modify /etc/bash.bashrc to activate conda environment.
+# Use python_venv in spec yml file instead.
+# Use absolute path of python file.
+# Here we use the tf native partitioner instead of autodist's PartitionPS.
+
+
+class Parallaxx(PSLoadBalancing, AllReduce):
+ """
+ Modify original parallax to remove replica on CPUs.
+ """
+
+ def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0):
+ PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
+ AllReduce.__init__(self, chunk_size)
+
+ # pylint: disable=attribute-defined-outside-init
+ def build(self, graph_item, resource_spec):
+ """Generate the strategy."""
+ expr = Strategy()
+
+ # For each variable, generate variable synchronizer config
+ expr.graph_config.replicas.extend(
+ [k for k, v in resource_spec.gpu_devices])
+ reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
+ self.loads = {ps: 0.0 for ps in reduction_device_names}
+
+ # Generate node config
+ node_config = []
+ for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
+ var_op_name = get_op_name(var.name)
+ grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
+ if isinstance(grad, ops.Tensor): # this is a dense variable
+ group_id = idx // self.chunk_size
+ config = self._gen_all_reduce_node_config(
+ var.name, group=group_id)
+ else: # sparse updates
+ # For Parallax Strategy, all PS vars are sparse so we don't use a proxy.
+ # Sparse variables are likely larger, so keeping copies would be costlier,
+ # and usually each device only requires a small part of the overall variable.
+ config = self._gen_ps_node_config(
+ var,
+ # For Parallax Strategy, all PS vars are sparse which does not need proxy.
+ False,
+ self._sync,
+ self._staleness
+ )
+ node_config.append(config)
+ expr.node_config.extend(node_config)
+
+ return expr
+
+
+def train_criteo(model, args):
+ resource_spec_file = os.path.join(os.path.dirname(
+ __file__), 'settings', 'plx_local_spec.yml')
+ autodist = AutoDist(resource_spec_file, Parallaxx())
+ respec = ResourceSpec(resource_spec_file)
+ if args.all:
+ from models.load_data import process_all_criteo_data
+ dense, sparse, all_labels = process_all_criteo_data()
+ dense_feature, val_dense = dense
+ sparse_feature, val_sparse = sparse
+ labels, val_labels = all_labels
+ else:
+ from models.load_data import process_sampled_criteo_data
+ dense_feature, sparse_feature, labels = process_sampled_criteo_data()
+
+ # autodist will split the feeding data
+ batch_size = 128
+ with tf.Graph().as_default() as g, autodist.scope():
+ dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
+ sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
+ y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
+ embed_partitioner = tf.fixed_size_partitioner(
+ len(respec.nodes), 0) if len(respec.nodes) > 1 else None
+ loss, y, opt = model(dense_input, sparse_input,
+ y_, embed_partitioner, False)
+ train_op = opt.minimize(loss)
+
+ sess = autodist.create_distributed_session()
+
+ my_feed_dict = {
+ dense_input: np.empty(shape=(batch_size, 13)),
+ sparse_input: np.empty(shape=(batch_size, 26)),
+ y_: np.empty(shape=(batch_size, 1)),
+ }
+
+ if args.all:
+ raw_log_file = os.path.join(os.path.split(os.path.abspath(__file__))[
+ 0], 'logs', 'tf_plx_%s.log' % (args.model))
+ print('Processing all data, log to', raw_log_file)
+ log_file = open(raw_log_file, 'w')
+ iterations = dense_feature.shape[0] // batch_size
+ total_epoch = 11
+ start_index = 0
+ for ep in range(total_epoch):
+ print("epoch %d" % ep)
+ st_time = time.time()
+ train_loss, train_acc, train_auc = [], [], []
+ for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+ start_index += batch_size
+ if start_index + batch_size > dense_feature.shape[0]:
+ start_index = 0
+ loss_val = sess.run(
+ [loss, y, y_, train_op], feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ train_auc.append(metrics.roc_auc_score(true_val, pred_val))
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ tra_auc = np.mean(train_auc)
+ en_time = time.time()
+ train_time = en_time - st_time
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, tra_auc, train_time)
+ print(printstr)
+ log_file.write(printstr + '\n')
+ log_file.flush()
+
+ else:
+ iteration = dense_feature.shape[0] // batch_size
+
+ epoch = 50
+ for ep in range(epoch):
+ print('epoch', ep)
+ if ep == 5:
+ start = time.time()
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ for idx in range(iteration):
+ start_index = idx * batch_size
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+
+ loss_val = sess.run(
+ [loss, y, y_, train_op], feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ if pred_val.shape[1] == 1: # for criteo case
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ else:
+ acc_val = np.equal(
+ np.argmax(pred_val, 1),
+ np.argmax(true_val, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print('all time:', (time.time() - start))
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True,
+ help="model to be tested")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+ raw_model = args.model
+ import tf_models
+ model = getattr(tf_models, raw_model)
+ dataset = raw_model.split('_')[-1]
+ print('Model:', raw_model)
+
+ if dataset == 'criteo':
+ train_criteo(model, args)
+ else:
+ raise NotImplementedError
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/ctr/settings/local_s1.yml b/examples/ctr/settings/local_s1.yml
new file mode 100644
index 0000000..a60a56b
--- /dev/null
+++ b/examples/ctr/settings/local_s1.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
+launch :
+ worker : 0
+ server : 1
+ scheduler : true
diff --git a/examples/ctr/settings/local_s1_w2.yml b/examples/ctr/settings/local_s1_w2.yml
new file mode 100644
index 0000000..34dc439
--- /dev/null
+++ b/examples/ctr/settings/local_s1_w2.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 2
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
+launch :
+ worker : 2
+ server : 1
+ scheduler : true
diff --git a/examples/ctr/settings/local_s1_w4.yml b/examples/ctr/settings/local_s1_w4.yml
new file mode 100644
index 0000000..0790c40
--- /dev/null
+++ b/examples/ctr/settings/local_s1_w4.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
+launch :
+ worker : 4
+ server : 1
+ scheduler : true
diff --git a/examples/ctr/settings/local_s1_w8.yml b/examples/ctr/settings/local_s1_w8.yml
new file mode 100644
index 0000000..a795608
--- /dev/null
+++ b/examples/ctr/settings/local_s1_w8.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 8
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
+launch :
+ worker : 8
+ server : 1
+ scheduler : true
diff --git a/examples/ctr/settings/local_w4.yml b/examples/ctr/settings/local_w4.yml
new file mode 100644
index 0000000..97b5bcd
--- /dev/null
+++ b/examples/ctr/settings/local_w4.yml
@@ -0,0 +1,6 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+ DMLC_PS_VAN_TYPE : p3
\ No newline at end of file
diff --git a/examples/ctr/settings/plx_local_spec.yml b/examples/ctr/settings/plx_local_spec.yml
new file mode 100644
index 0000000..11f9025
--- /dev/null
+++ b/examples/ctr/settings/plx_local_spec.yml
@@ -0,0 +1,4 @@
+nodes:
+ - address: localhost
+ cpus: [0]
+ gpus: [0,1,2,3,4,5,6,7]
diff --git a/examples/ctr/settings/tf_local_s1_w2.json b/examples/ctr/settings/tf_local_s1_w2.json
new file mode 100644
index 0000000..c1e0148
--- /dev/null
+++ b/examples/ctr/settings/tf_local_s1_w2.json
@@ -0,0 +1,9 @@
+{
+ "worker": [
+ "127.0.0.1:12349",
+ "127.0.0.1:12348"
+ ],
+ "ps": [
+ "127.0.0.1:12345"
+ ]
+}
\ No newline at end of file
diff --git a/examples/ctr/settings/tf_local_s1_w4.json b/examples/ctr/settings/tf_local_s1_w4.json
new file mode 100644
index 0000000..d0c339b
--- /dev/null
+++ b/examples/ctr/settings/tf_local_s1_w4.json
@@ -0,0 +1,11 @@
+{
+ "worker": [
+ "127.0.0.1:23459",
+ "127.0.0.1:23458",
+ "127.0.0.1:23457",
+ "127.0.0.1:23456"
+ ],
+ "ps": [
+ "127.0.0.1:23455"
+ ]
+}
\ No newline at end of file
diff --git a/examples/ctr/settings/tf_local_s1_w8.json b/examples/ctr/settings/tf_local_s1_w8.json
new file mode 100644
index 0000000..05d1f0e
--- /dev/null
+++ b/examples/ctr/settings/tf_local_s1_w8.json
@@ -0,0 +1,15 @@
+{
+ "worker": [
+ "127.0.0.1:34569",
+ "127.0.0.1:34568",
+ "127.0.0.1:34567",
+ "127.0.0.1:34566",
+ "127.0.0.1:34565",
+ "127.0.0.1:34564",
+ "127.0.0.1:34563",
+ "127.0.0.1:34562"
+ ],
+ "ps": [
+ "127.0.0.1:34575"
+ ]
+}
\ No newline at end of file
diff --git a/examples/ctr/tests/hybrid_dcn_criteo.sh b/examples/ctr/tests/hybrid_dcn_criteo.sh
new file mode 100644
index 0000000..8fcb203
--- /dev/null
+++ b/examples/ctr/tests/hybrid_dcn_criteo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
+mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml
diff --git a/examples/ctr/tests/hybrid_dfm_criteo.sh b/examples/ctr/tests/hybrid_dfm_criteo.sh
new file mode 100644
index 0000000..caaa171
--- /dev/null
+++ b/examples/ctr/tests/hybrid_dfm_criteo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
+mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml
diff --git a/examples/ctr/tests/hybrid_wdl_adult.sh b/examples/ctr/tests/hybrid_wdl_adult.sh
new file mode 100644
index 0000000..d3f41ee
--- /dev/null
+++ b/examples/ctr/tests/hybrid_wdl_adult.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
+mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml
diff --git a/examples/ctr/tests/hybrid_wdl_criteo.sh b/examples/ctr/tests/hybrid_wdl_criteo.sh
new file mode 100644
index 0000000..36a6b88
--- /dev/null
+++ b/examples/ctr/tests/hybrid_wdl_criteo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
+mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml
diff --git a/examples/ctr/tests/local_dcn_criteo.sh b/examples/ctr/tests/local_dcn_criteo.sh
new file mode 100644
index 0000000..f1cd439
--- /dev/null
+++ b/examples/ctr/tests/local_dcn_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model dcn_criteo --val
diff --git a/examples/ctr/tests/local_dfm_criteo.sh b/examples/ctr/tests/local_dfm_criteo.sh
new file mode 100644
index 0000000..d57378c
--- /dev/null
+++ b/examples/ctr/tests/local_dfm_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model dfm_criteo --val
diff --git a/examples/ctr/tests/local_wdl_adult.sh b/examples/ctr/tests/local_wdl_adult.sh
new file mode 100644
index 0000000..4ccb1cd
--- /dev/null
+++ b/examples/ctr/tests/local_wdl_adult.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model wdl_adult --val
diff --git a/examples/ctr/tests/local_wdl_criteo.sh b/examples/ctr/tests/local_wdl_criteo.sh
new file mode 100644
index 0000000..cfd2e45
--- /dev/null
+++ b/examples/ctr/tests/local_wdl_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model wdl_criteo --val
diff --git a/examples/ctr/tests/ps_dcn_criteo.sh b/examples/ctr/tests/ps_dcn_criteo.sh
new file mode 100644
index 0000000..fba3350
--- /dev/null
+++ b/examples/ctr/tests/ps_dcn_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml
diff --git a/examples/ctr/tests/ps_dfm_criteo.sh b/examples/ctr/tests/ps_dfm_criteo.sh
new file mode 100644
index 0000000..ee423e9
--- /dev/null
+++ b/examples/ctr/tests/ps_dfm_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml
diff --git a/examples/ctr/tests/ps_wdl_adult.sh b/examples/ctr/tests/ps_wdl_adult.sh
new file mode 100644
index 0000000..c0165b6
--- /dev/null
+++ b/examples/ctr/tests/ps_wdl_adult.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml
diff --git a/examples/ctr/tests/ps_wdl_criteo.sh b/examples/ctr/tests/ps_wdl_criteo.sh
new file mode 100644
index 0000000..8d2f9a1
--- /dev/null
+++ b/examples/ctr/tests/ps_wdl_criteo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../run_hetu.py
+
+python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml
diff --git a/examples/ctr/tests/tf_2workers.sh b/examples/ctr/tests/tf_2workers.sh
new file mode 100644
index 0000000..8f1c902
--- /dev/null
+++ b/examples/ctr/tests/tf_2workers.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+rm -f logs/temp*.log
+CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 0 > ${workdir}/../logs/temp0.log &
+CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 1 > ${workdir}/../logs/temp1.log &
+wait
diff --git a/examples/ctr/tests/tf_4workers.sh b/examples/ctr/tests/tf_4workers.sh
new file mode 100644
index 0000000..d1ad330
--- /dev/null
+++ b/examples/ctr/tests/tf_4workers.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+rm -f logs/temp*.log
+CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 0 > ${workdir}/../logs/temp0.log &
+CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 1 > ${workdir}/../logs/temp1.log &
+CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 2 > ${workdir}/../logs/temp2.log &
+CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 3 > ${workdir}/../logs/temp3.log &
+wait
diff --git a/examples/ctr/tests/tf_8workers.sh b/examples/ctr/tests/tf_8workers.sh
new file mode 100644
index 0000000..d0ede08
--- /dev/null
+++ b/examples/ctr/tests/tf_8workers.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/../tf_launch_worker.py
+
+rm -f logs/temp*.log
+CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 0 > ${workdir}/../logs/temp0.log &
+CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 1 > ${workdir}/../logs/temp1.log &
+CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 2 > ${workdir}/../logs/temp2.log &
+CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 3 > ${workdir}/../logs/temp3.log &
+CUDA_VISIBLE_DEVICES=4 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 4 > ${workdir}/../logs/temp4.log &
+CUDA_VISIBLE_DEVICES=5 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 5 > ${workdir}/../logs/temp5.log &
+CUDA_VISIBLE_DEVICES=6 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 6 > ${workdir}/../logs/temp6.log &
+CUDA_VISIBLE_DEVICES=7 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 7 > ${workdir}/../logs/temp7.log &
+wait
diff --git a/examples/ctr/tf_launch_server.py b/examples/ctr/tf_launch_server.py
new file mode 100644
index 0000000..aaa0b7c
--- /dev/null
+++ b/examples/ctr/tf_launch_server.py
@@ -0,0 +1,49 @@
+import os
+import tensorflow as tf
+import multiprocessing
+import signal
+import json
+import argparse
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+ os.environ['CUDA_VISIBLE_DEVICES'] = ''
+
+
+pop_env()
+
+
+def start_server(cluster, task_id):
+ server = tf.train.Server(cluster, job_name='ps', task_index=task_id)
+ server.join()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config", type=str, default='./settings/tf_dist_s4_w2.json', help="config file path")
+ parser.add_argument("--id", type=int, required=True)
+ args = parser.parse_args()
+ raw_config = args.config
+ config = json.load(open(raw_config))
+ cluster = tf.train.ClusterSpec(config)
+ global proc
+ proc = multiprocessing.Process(
+ target=start_server, args=[cluster, args.id, ])
+ proc.start()
+ signal.signal(signal.SIGINT, signal_handler)
+ proc.join()
+
+
+def signal_handler(signal, frame):
+ print("SIGINT signal caught, stop Training")
+ global proc
+ proc.kill()
+ exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/ctr/tf_launch_worker.py b/examples/ctr/tf_launch_worker.py
new file mode 100644
index 0000000..e56825d
--- /dev/null
+++ b/examples/ctr/tf_launch_worker.py
@@ -0,0 +1,353 @@
+import tensorflow as tf
+import numpy as np
+import argparse
+import os
+import time
+import json
+from sklearn import metrics
+from tqdm import tqdm
+
+
+def pop_env():  # drop proxy env vars; workers keep GPU visibility (unlike the server launcher)
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+
+def train_criteo(model, cluster, task_id, nrank, args):  # PS-mode training of a criteo CTR model; one shard per worker rank
+ def get_current_shard(data):  # contiguous shard for this rank; last rank also takes the remainder
+ part_size = data.shape[0] // nrank
+ start = part_size * task_id
+ end = start + part_size if task_id != nrank - 1 else data.shape[0]
+ return data[start:end]
+
+ if args.all:
+ from models.load_data import process_all_criteo_data
+ dense, sparse, all_labels = process_all_criteo_data()
+ dense_feature = get_current_shard(dense[0])
+ sparse_feature = get_current_shard(sparse[0])
+ labels = get_current_shard(all_labels[0])
+ val_dense = get_current_shard(dense[1])
+ val_sparse = get_current_shard(sparse[1])
+ val_labels = get_current_shard(all_labels[1])
+ else:
+ from models.load_data import process_sampled_criteo_data
+ dense_feature, sparse_feature, labels = process_sampled_criteo_data()
+ dense_feature = get_current_shard(dense_feature)
+ sparse_feature = get_current_shard(sparse_feature)
+ labels = get_current_shard(labels)
+
+ batch_size = 128
+ worker_device = "/job:worker/task:%d/gpu:0" % (task_id)
+ with tf.device(worker_device):
+ dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
+ sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
+ y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])  # FIX: removed duplicated "y_ = y_ =" assignment
+
+ with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)):
+ server_num = len(cluster.as_dict()['ps'])
+ embed_partitioner = tf.fixed_size_partitioner(
+ server_num, 0) if server_num > 1 else None  # shard the embedding over all PS tasks
+ loss, y, opt = model(dense_input, sparse_input, y_,
+ embed_partitioner, param_on_gpu=False)
+ train_op = opt.minimize(loss)
+
+ server = tf.train.Server(
+ cluster, job_name="worker", task_index=task_id)
+ init = tf.compat.v1.global_variables_initializer()
+ sv = tf.train.Supervisor(
+ is_chief=(task_id == 0),
+ init_op=init,
+ recovery_wait_secs=1)
+ sess_config = tf.compat.v1.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=False,
+ device_filters=["/job:ps",
+ "/job:worker/task:%d" % task_id])
+ sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
+ # sess.run(init)
+ if task_id == 0:
+ writer = tf.compat.v1.summary.FileWriter('logs/board', sess.graph)
+
+ my_feed_dict = {
+ dense_input: np.empty(shape=(batch_size, 13)),
+ sparse_input: np.empty(shape=(batch_size, 26)),
+ y_: np.empty(shape=(batch_size, 1)),
+ }
+
+ if args.all:
+ raw_log_file = './logs/tf_dist_%s_%d.log' % (args.model, task_id)
+ print('Processing all data, log to', raw_log_file)
+ log_file = open(raw_log_file, 'w')
+ iterations = dense_feature.shape[0] // batch_size
+ total_epoch = 21
+ start_index = 0
+ for ep in range(total_epoch):
+ print("epoch %d" % ep)
+ st_time = time.time()
+ train_loss, train_acc, train_auc = [], [], []
+ for it in range(iterations // 10 + (ep % 10 == 9) * (iterations % 10)):  # each "epoch" is ~1/10 of the data; every 10th adds the remainder
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+ start_index += batch_size
+ if start_index + batch_size > dense_feature.shape[0]:
+ start_index = 0
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ train_auc.append(metrics.roc_auc_score(true_val, pred_val))
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ tra_auc = np.mean(train_auc)
+ en_time = time.time()
+ train_time = en_time - st_time
+
+ if args.val:
+ val_loss, val_acc, val_auc = [], [], []
+ for it in range(val_dense.shape[0] // batch_size):
+ local_st = it * batch_size
+ my_feed_dict[dense_input][:] = val_dense[local_st: local_st + batch_size]
+ my_feed_dict[sparse_input][:] = val_sparse[local_st: local_st + batch_size]
+ my_feed_dict[y_][:] = val_labels[local_st: local_st+batch_size]
+ loss_val = sess.run([loss, y, y_], feed_dict=my_feed_dict)  # no train_op: evaluation only
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ val_loss.append(loss_val[0])
+ val_acc.append(acc_val)
+ val_auc.append(metrics.roc_auc_score(true_val, pred_val))
+ v_accuracy = np.mean(val_acc)
+ v_loss = np.mean(val_loss)
+ v_auc = np.mean(val_auc)
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, tra_auc, v_loss, v_accuracy, v_auc, train_time)
+ else:
+ printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
+ % (tra_loss, tra_accuracy, tra_auc, train_time)
+
+ print(printstr)
+ log_file.write(printstr + '\n')
+ log_file.flush()
+ else:
+ # here no val
+ iteration = dense_feature.shape[0] // batch_size
+
+ epoch = 10
+ for ep in range(epoch):
+ print('epoch', ep)
+ if ep == 5:
+ start = time.time()  # timing excludes the first 5 warm-up epochs
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ for idx in range(iteration):
+ start_index = idx * batch_size
+ my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
+ my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
+ my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
+
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ pred_val = loss_val[1]
+ true_val = loss_val[2]
+ if pred_val.shape[1] == 1: # for criteo case
+ acc_val = np.equal(
+ true_val,
+ pred_val > 0.5)
+ else:
+ acc_val = np.equal(
+ np.argmax(pred_val, 1),
+ np.argmax(true_val, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print("tensorflow: ", (time.time() - start))
+
+
+def train_adult(model, cluster, task_id, nrank):  # PS-mode training of wdl_adult; no validation / --all support
+ from models.load_data import load_adult_data
+ x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False)
+ part_size = len(x_train_deep) // nrank
+ start = part_size * task_id
+ end = start + part_size if task_id != nrank - 1 else len(x_train_deep)
+ x_train_deep = x_train_deep[start:end]
+ x_train_wide = x_train_wide[start:end]
+ y_train = y_train[start:end]
+
+ batch_size = 128
+ total_epoch = 50
+ dim_wide = 809
+
+ worker_device = "/job:worker/task:%d/gpu:0" % (task_id)
+ with tf.device(worker_device):
+ X_deep = []  # 8 categorical (int) inputs followed by 4 continuous (float) inputs
+ for i in range(8):
+ X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1]))
+ for i in range(4):
+ X_deep.append(tf.compat.v1.placeholder(
+ tf.float32, [batch_size, 1]))
+ X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide])
+ y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2])
+ loss, y, train_op, global_step = model(
+ X_deep, X_wide, y_, cluster, task_id)
+
+ with tf.device(
+ tf.compat.v1.train.replica_device_setter(
+ worker_device=worker_device,
+ cluster=cluster)):
+ server = tf.train.Server(
+ cluster, job_name="worker", task_index=task_id)
+ init = tf.global_variables_initializer()
+ sv = tf.train.Supervisor(
+ is_chief=(task_id == 0),
+ init_op=init,
+ recovery_wait_secs=1,
+ global_step=global_step)
+ sess_config = tf.ConfigProto(
+ # allow_soft_placement=True,
+ log_device_placement=False,
+ device_filters=["/job:ps",
+ "/job:worker/task:%d" % task_id])
+ sess = sv.prepare_or_wait_for_session(
+ server.target, config=sess_config)
+
+ sess.run(init)
+
+ iterations = x_train_deep.shape[0] // batch_size
+ for ep in range(total_epoch):
+ print('epoch', ep)
+ if ep == 5:
+ start = time.time()  # timing excludes the first 5 warm-up epochs
+ ep_st = time.time()
+ train_loss = []
+ train_acc = []
+ pre_index = 0
+
+ for it in range(iterations):
+ batch_x_deep = x_train_deep[pre_index:pre_index + batch_size]
+ batch_x_wide = x_train_wide[pre_index:pre_index + batch_size]
+ batch_y = y_train[pre_index:pre_index + batch_size]
+ pre_index += batch_size
+
+ my_feed_dict = dict()
+ for i in range(12):
+ my_feed_dict[X_deep[i]] = np.array(
+ batch_x_deep[:, i]).reshape(-1, 1)  # BUG FIX: was [:, 1] — fed column 1 to all 12 deep inputs
+
+ my_feed_dict[X_wide] = np.array(batch_x_wide)
+ my_feed_dict[y_] = batch_y
+ loss_val = sess.run([loss, y, y_, train_op],
+ feed_dict=my_feed_dict)
+ acc_val = np.equal(
+ np.argmax(loss_val[1], 1),
+ np.argmax(loss_val[2], 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ tra_accuracy = np.mean(train_acc)
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (tra_loss, tra_accuracy, ep_en - ep_st))
+ print("tensorflow: ", (time.time() - start))
+
+
+def test_bandwidth(cluster, task_id):  # micro-benchmark: repeatedly assign a worker variable to a PS variable
+ print('test bandwidth')
+ iters = 1000
+ params_size = 128 * 9  # number of int32 elements transferred per assign
+ ps_device = "/job:ps/task:0/cpu:0"
+ worker_device = "/job:worker/task:%d/cpu:0" % (task_id)
+
+ with tf.device(ps_device):
+ dtype = tf.int32
+ params = tf.get_variable("params", shape=[params_size], dtype=dtype,
+ initializer=tf.zeros_initializer())
+ with tf.device(tf.compat.v1.train.replica_device_setter(
+ worker_device=worker_device,
+ cluster=cluster)):
+ update = tf.get_variable("update", shape=[params_size], dtype=dtype,
+ initializer=tf.ones_initializer())
+ add_op = params.assign(update)  # each run ships `update` worker -> PS
+
+ server = tf.train.Server(
+ cluster, job_name="worker", task_index=task_id)
+ init = tf.global_variables_initializer()
+ sv = tf.train.Supervisor(
+ is_chief=(task_id == 0),
+ init_op=init,
+ recovery_wait_secs=1)
+ sess_config = tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=False,
+ device_filters=["/job:ps",
+ "/job:worker/task:%d" % task_id])
+ sess = sv.prepare_or_wait_for_session(
+ server.target, config=sess_config)
+
+ sess.run(init)
+ # warm up
+ for i in range(5):
+ sess.run(add_op.op)
+
+ start_time = time.time()
+ for i in range(iters):
+ sess.run(add_op.op)
+ elapsed_time = time.time() - start_time
+ ans = float(iters)*(params_size / 1024 / 1024)/elapsed_time  # NOTE(review): params_size is an element count, not bytes; for int32 the true MB/s is 4x this — confirm intended
+ print("transfer rate: %f MB/s" % (ans))
+
+
+def main():  # dispatch: pick model by name and dataset suffix, or run the bandwidth test
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str, required=True,
+ help="model to be tested")
+ parser.add_argument("--rank", type=int, required=True,
+ help="rank of process")
+ parser.add_argument(
+ "--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path")
+ parser.add_argument("--val", action="store_true",
+ help="whether to use validation")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+ raw_model = args.model
+ task_id = int(args.rank)
+ raw_config = args.config
+
+ config = json.load(open(raw_config))
+ cluster = tf.train.ClusterSpec(config)
+
+ if raw_model != 'band':
+ import tf_models
+ model = eval('tf_models.' + raw_model)  # NOTE(review): eval on a CLI-supplied name; assumes a trusted operator
+ dataset = raw_model.split('_')[-1]  # model names end with their dataset, e.g. wdl_criteo
+ print('Model:', raw_model)
+ if dataset == 'criteo':
+ train_criteo(model, cluster, task_id, len(config['worker']), args)
+ elif dataset == 'adult':
+ # not support val or all
+ train_adult(model, cluster, task_id, len(config['worker']))
+ else:
+ raise NotImplementedError
+ else:
+ test_bandwidth(cluster, task_id)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/ctr/tf_models/__init__.py b/examples/ctr/tf_models/__init__.py
new file mode 100644
index 0000000..659f885
--- /dev/null
+++ b/examples/ctr/tf_models/__init__.py
@@ -0,0 +1,4 @@
+from .tf_dcn_criteo import dcn_criteo
+from .tf_deepfm_criteo import dfm_criteo
+from .tf_wdl_criteo import wdl_criteo
+from .tf_wdl_adult import wdl_adult
diff --git a/examples/ctr/tf_models/tf_dcn_criteo.py b/examples/ctr/tf_models/tf_dcn_criteo.py
new file mode 100644
index 0000000..90a3b58
--- /dev/null
+++ b/examples/ctr/tf_models/tf_dcn_criteo.py
@@ -0,0 +1,69 @@
+import tensorflow as tf
+
+
+def cross_layer(x0, x1, device):  # one DCN cross layer: x0 * (x1 . w) + x1 + b
+ # x0: input embedding feature (batch_size, 26 * embedding_size + 13)
+ # x1: the output of last layer (batch_size, 26 * embedding_size + 13)
+
+ embed_dim = x1.shape[-1]
+ with tf.device(device):  # device is a device-name string (e.g. '/gpu:0')
+ w = tf.compat.v1.get_variable(name='w', shape=(embed_dim,))
+ b = tf.compat.v1.get_variable(name='b', shape=(embed_dim,))
+ x_1w = tf.tensordot(tf.reshape(x1, [-1, 1, embed_dim]), w, axes=1)  # per-sample scalar x1.w
+ cross = x0 * x_1w
+ return cross + x1 + b  # residual connection plus bias
+
+
+def build_cross_layer(x0, num_layers=3, device='/gpu:0'):  # stack DCN cross layers; FIX: default was tf.device('/gpu:0'), but cross_layer wraps `device` in tf.device() itself, so it must be a device-name string
+ x1 = x0
+ for i in range(num_layers):
+ with tf.compat.v1.variable_scope('layer%d' % i):  # per-layer scope keeps w/b names distinct
+ x1 = cross_layer(x0, x1, device)
+ return x1
+
+
+def dcn_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True):  # DCN for criteo: CrossNet + 3-layer MLP, sigmoid CE loss
+ feature_dimension = 33762577  # criteo sparse-feature vocabulary size
+ embedding_size = 128
+ learning_rate = 0.003 / 8 # here to comply with HETU
+ all_partitioner, embed_partitioner = (
+ partitioner, None) if part_all else (None, partitioner)  # partition either every variable or just the embedding
+ with tf.compat.v1.variable_scope('dcn', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner):
+ with tf.device('/cpu:0'):
+ Embedding = tf.compat.v1.get_variable(name="Embedding", shape=(
+ feature_dimension, embedding_size), partitioner=embed_partitioner)
+ sparse_input_embedding = tf.nn.embedding_lookup(
+ Embedding, sparse_input)
+
+ device = '/gpu:0' if param_on_gpu else '/cpu:0'
+ with tf.device(device):
+ W1 = tf.compat.v1.get_variable(
+ name='W1', shape=[26*embedding_size + 13, 256])
+ W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256])
+ W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256])
+ W4 = tf.compat.v1.get_variable(
+ name='W4', shape=[256 + 26 * embedding_size + 13, 1])
+
+ with tf.device('/gpu:0'):
+ flatten = tf.reshape(sparse_input_embedding,
+ (-1, 26*embedding_size))
+ x = tf.concat((flatten, dense_input), 1)  # (batch, 26*128 + 13): shared input of both branches
+ # CrossNet
+ cross_output = build_cross_layer(x, num_layers=3, device=device)
+ # DNN
+ flatten = x
+
+ fc1 = tf.matmul(flatten, W1)
+ relu1 = tf.nn.relu(fc1)
+ fc2 = tf.matmul(relu1, W2)
+ relu2 = tf.nn.relu(fc2)
+ y3 = tf.matmul(relu2, W3)
+
+ y4 = tf.concat((cross_output, y3), 1)  # concat cross and deep branches before the final projection
+ y = tf.matmul(y4, W4)
+ loss = tf.reduce_mean(
+ tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_))
+
+ optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+ learning_rate)
+ return loss, y, optimizer  # caller builds train_op via optimizer.minimize(loss)
diff --git a/examples/ctr/tf_models/tf_deepfm_criteo.py b/examples/ctr/tf_models/tf_deepfm_criteo.py
new file mode 100644
index 0000000..ea9d8f6
--- /dev/null
+++ b/examples/ctr/tf_models/tf_deepfm_criteo.py
@@ -0,0 +1,62 @@
+import tensorflow as tf
+
+
+def dfm_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True):  # DeepFM for criteo: 1st-order FM + 2nd-order FM + MLP
+ feature_dimension = 33762577  # criteo sparse-feature vocabulary size
+ embedding_size = 128
+ learning_rate = 0.01 / 8 # here to comply with HETU
+ all_partitioner, embed_partitioner = (
+ partitioner, None) if part_all else (None, partitioner)
+ with tf.compat.v1.variable_scope('dfm', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner):
+ with tf.device('/cpu:0'):
+ Embedding1 = tf.compat.v1.get_variable(name="Embedding1", shape=(
+ feature_dimension, 1), partitioner=embed_partitioner)
+ Embedding2 = tf.compat.v1.get_variable(name="embeddings", shape=(
+ feature_dimension, embedding_size), partitioner=embed_partitioner)  # NOTE(review): variable name "embeddings" breaks the Embedding1/Embedding2 convention — confirm intended (renaming would change checkpoint keys)
+ sparse_1dim_input = tf.nn.embedding_lookup(
+ Embedding1, sparse_input)
+ sparse_2dim_input = tf.nn.embedding_lookup(
+ Embedding2, sparse_input)
+
+ device = '/gpu:0' if param_on_gpu else '/cpu:0'
+ with tf.device(device):
+ FM_W = tf.compat.v1.get_variable(name='FM_W', shape=[13, 1])
+ W1 = tf.compat.v1.get_variable(
+ name='W1', shape=[26*embedding_size, 256])
+ W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256])
+ W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 1])
+
+ with tf.device('/gpu:0'):
+ fm_dense_part = tf.matmul(dense_input, FM_W)
+ fm_sparse_part = tf.reduce_sum(sparse_1dim_input, 1)
+ # fst order output
+ y1 = fm_dense_part + fm_sparse_part
+
+ sparse_2dim_sum = tf.reduce_sum(sparse_2dim_input, 1)
+ sparse_2dim_sum_square = tf.multiply(
+ sparse_2dim_sum, sparse_2dim_sum)  # (sum v)^2
+
+ sparse_2dim_square = tf.multiply(
+ sparse_2dim_input, sparse_2dim_input)
+ sparse_2dim_square_sum = tf.reduce_sum(sparse_2dim_square, 1)  # sum v^2
+ sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
+ sparse_2dim_half = sparse_2dim * 0.5  # FM identity: 0.5 * ((sum v)^2 - sum v^2)
+ # snd order output
+ y2 = tf.reduce_sum(sparse_2dim_half, 1, keepdims=True)
+
+ # DNN
+ flatten = tf.reshape(sparse_2dim_input, (-1, 26*embedding_size))
+ fc1 = tf.matmul(flatten, W1)
+ relu1 = tf.nn.relu(fc1)
+ fc2 = tf.matmul(relu1, W2)
+ relu2 = tf.nn.relu(fc2)
+ y3 = tf.matmul(relu2, W3)
+
+ y4 = y1 + y2
+ y = y4 + y3  # logits = FM 1st-order + FM 2nd-order + deep
+ loss = tf.reduce_mean(
+ tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_))
+
+ optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+ learning_rate)
+ return loss, y, optimizer
diff --git a/examples/ctr/tf_models/tf_wdl_adult.py b/examples/ctr/tf_models/tf_wdl_adult.py
new file mode 100644
index 0000000..4137d08
--- /dev/null
+++ b/examples/ctr/tf_models/tf_wdl_adult.py
@@ -0,0 +1,77 @@
+import tensorflow as tf
+import numpy as np
+
+
+def wdl_adult(X_deep, X_wide, y_, cluster=None, task_id=None):  # Wide&Deep for adult; PS placement when cluster is given
+ lr_ = 5 / 128
+ dim_wide = 809
+ dim_deep = 68  # 8 embeddings * 8 dims + 4 continuous features
+ use_ps = cluster is not None
+
+ if use_ps:
+ device = tf.device(tf.train.replica_device_setter(
+ worker_device="/job:worker/task:%d/gpu:0" % (task_id),
+ cluster=cluster))
+ else:
+ device = tf.device('/gpu:0')
+ global_step = tf.Variable(0, name="global_step", trainable=False)
+ with device:
+ if use_ps:
+ global_step = tf.Variable(0, name="global_step", trainable=False)  # NOTE(review): rebinds the global_step created above, leaving the first one orphaned in the graph — confirm intended
+
+ rand = np.random.RandomState(seed=123)  # fixed seed so every worker starts from identical weights
+ W = tf.Variable(rand.normal(scale=0.1, size=[
+ dim_wide+20, 2]), dtype=tf.float32)
+ W1 = tf.Variable(rand.normal(scale=0.1, size=[
+ dim_deep, 50]), dtype=tf.float32)
+ b1 = tf.Variable(rand.normal(scale=0.1, size=[50]), dtype=tf.float32)
+ W2 = tf.Variable(rand.normal(
+ scale=0.1, size=[50, 20]), dtype=tf.float32)
+ b2 = tf.Variable(rand.normal(scale=0.1, size=[20]), dtype=tf.float32)
+
+ Embedding = []
+
+ for i in range(8):
+ Embedding.append(tf.Variable(rand.normal(
+ scale=0.1, size=[20, 8]), dtype=tf.float32))  # one 20x8 table per categorical field
+
+ # deep
+ X_deep_input = None
+ for i in range(8):
+ now = tf.nn.embedding_lookup(Embedding[i], X_deep[i])
+ now = tf.reshape(now, (-1, 8))
+ if X_deep_input is None:
+ X_deep_input = now
+ else:
+ X_deep_input = tf.concat([X_deep_input, now], 1)
+
+ for i in range(4):
+ now = tf.reshape(X_deep[i + 8], (-1, 1))  # append the 4 continuous features
+ X_deep_input = tf.concat([X_deep_input, now], 1)
+
+ mat1 = tf.matmul(X_deep_input, W1)
+ add1 = tf.add(mat1, b1)
+ relu1 = tf.nn.relu(add1)
+ dropout1 = relu1  # dropout disabled; kept as identity
+ mat2 = tf.matmul(dropout1, W2)
+ add2 = tf.add(mat2, b2)
+ relu2 = tf.nn.relu(add2)
+ dropout2 = relu2
+ dmodel = dropout2
+
+ # wide
+ wmodel = tf.concat([X_wide, dmodel], 1)  # wide features concatenated with the deep tower output
+ wmodel = tf.matmul(wmodel, W)
+
+ y = wmodel
+ loss = tf.reduce_mean(
+ tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
+ )
+
+ optimizer = tf.train.GradientDescentOptimizer(lr_)
+ train_op = optimizer.minimize(loss, global_step=global_step)
+
+ if use_ps:
+ return loss, y, train_op, global_step
+ else:
+ return loss, y, train_op
diff --git a/examples/ctr/tf_models/tf_wdl_criteo.py b/examples/ctr/tf_models/tf_wdl_criteo.py
new file mode 100644
index 0000000..d3d7454
--- /dev/null
+++ b/examples/ctr/tf_models/tf_wdl_criteo.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+
+
+def wdl_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True):  # Wide&Deep for criteo: MLP on dense features, raw embeddings as the wide part
+ feature_dimension = 33762577  # criteo sparse-feature vocabulary size
+ embedding_size = 128
+ learning_rate = 0.01 / 8 # here to comply with HETU
+ all_partitioner, embed_partitioner = (
+ partitioner, None) if part_all else (None, partitioner)
+ with tf.compat.v1.variable_scope('wdl', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner):
+ with tf.device('/cpu:0'):
+ Embedding = tf.compat.v1.get_variable(name="Embedding", shape=(
+ feature_dimension, embedding_size), partitioner=embed_partitioner)
+ sparse_input_embedding = tf.nn.embedding_lookup(
+ Embedding, sparse_input)
+ device = '/gpu:0' if param_on_gpu else '/cpu:0'
+ with tf.device(device):
+ W1 = tf.compat.v1.get_variable(name='W1', shape=[13, 256])
+ W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256])
+ W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256])
+ W4 = tf.compat.v1.get_variable(
+ name='W4', shape=[256 + 26 * embedding_size, 1])
+ with tf.device('/gpu:0'):
+ sparse_input_embedding = tf.reshape(
+ sparse_input_embedding, (-1, 26*embedding_size))
+ flatten = dense_input  # deep tower consumes only the 13 dense features
+ fc1 = tf.matmul(flatten, W1)
+ relu1 = tf.nn.relu(fc1)
+ fc2 = tf.matmul(relu1, W2)
+ relu2 = tf.nn.relu(fc2)
+ y3 = tf.matmul(relu2, W3)
+
+ y4 = tf.concat((sparse_input_embedding, y3), 1)  # wide (embeddings) + deep, then final projection
+ y = tf.matmul(y4, W4)
+ loss = tf.reduce_mean(
+ tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_))
+
+ optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+ learning_rate)
+ return loss, y, optimizer
diff --git a/examples/gnn/README.md b/examples/gnn/README.md
new file mode 100644
index 0000000..28ea840
--- /dev/null
+++ b/examples/gnn/README.md
@@ -0,0 +1,112 @@
+# GNN Examples (with Distributed Settings)
+
+## Structure
+```
+- gnn
+ - gnn_tools/ scripts to prepare data and other
+ - config/ distributed configurations
+ - gnn_model/ gnn models
+ - run_dist.py train gnn models in ps setting
+ - run_dist_hybrid.py train gnn models in hybrid setting
+ - run_single.py train with a single gpu
+
+```
+
+## Configuration file explained
+
+We use a simple yaml file to specify the run configuration.
+
+```yaml
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+launch :
+ worker : 4
+ server : 1
+ graph_server : 1
+ scheduler : true
+```
+
+The 4 k-v pairs in "shared" are used for the PS-lite parameter server and will be added into the environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" to an IP address reachable within the cluster.
+
+The difference between GNN models and other models is that we need to launch a set of graph servers to carry out graph sampling. Note that the total number of graph servers MUST be equal to the number of graph partitions. It is recommended to partition the graph into as many parts as there are machines and launch one graph server on each machine.
+
+Note that there should be only 1 scheduler, and it should only be launched on the machine with DMLC_PS_ROOT_URI.
+
+Note that the launcher automatically selects a network interface for you. If this fails, try adding "DMLC_INTERFACE : eth0" to select the right network device.
+
+## Prepare graph datasets
+
+1. Prepare Normal dataset (use dense feature and no embedding)
+
+ ```shell
+python3 -m graphmix.partition [-d DatasetName] -n4 -p ~/yourDataPath
+ ```
+
+ We currently have the following dataset Cora, PubMed, Reddit, Flickr, Yelp, ogbn-products, ogbn-arxiv.
+
+2. Prepare ogbn-mag or Reddit dataset (with sparse embedding)
+
+ Then you can use the following command to partition the graph into 4 parts for 4-workers to use.
+
+ ```bash
+ python3 gnn_tools/part_graph.py [-d DatasetName] -n 4 -p ~/yourDataPath
+ ```
+
+ Also note that if you want to train on K nodes, replace "-n 4" with "-n K".
+
+3. Prepare Amazon dataset: This dataset is introduced in the Cluster-GCN paper and there are two files to be downloaded: [metadata.json](https://drive.google.com/file/d/0B2jJQxNRDl_rVVZCdWVnYmUyRDg) and [map_files](https://drive.google.com/file/d/0B3lPMIHmG6vGd2U3VHB0Wkk4cGM). Once you have downloaded and extracted the files and put them together under the gnn_tools directory, you can run
+
+ ```bash
+ python3 prepare_amazon_dataset.py
+ ```
+
+ Note that you need nltk installed in your environment to run this script and this will take a while.
+
+ After running the script, you will get two output files: graph.npz and sparsefeature.npy. Put them in the right place.
+
+ ```bash
+ mkdir -p ~/.graphmix_dataset/AmazonSparse
+ mv graph.npz sparsefeature.npy ~/.graphmix_dataset/AmazonSparse
+ ```
+
+ Finally, use the part_graph.py to partition the graph
+
+ ```
+ python3 gnn_tools/part_graph.py -d AmazonSparse -n 4 -p ~/yourDataPath
+ ```
+
+## Training GNN Models
+
+After you have prepared a graph dataset, you can start training embedding models on it. We take Reddit as an example.
+
+To train on PS communication mode. Run
+
+```
+python3 run_dist.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+To train on Hybrid communication mode. Run
+
+```
+mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+When running in Hybrid mode, you will also have to launch the servers and the scheduler separately
+
+```
+python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit --server
+```
+
+A --dense argument is used if you are training with a normal dataset (with dense feature).
+
+## Train with a single card
+
+This time you will have to run partition as we mentioned before with n=1. After that, run
+
+```shell
+python3 run_single.py -p ~/yourDataPath/Reddit [--dense]
+```
+
diff --git a/examples/gnn/config/local_w2.yml b/examples/gnn/config/local_w2.yml
new file mode 100644
index 0000000..5e53073
--- /dev/null
+++ b/examples/gnn/config/local_w2.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 2
+ DMLC_NUM_SERVER : 1
+launch :
+ worker : 2
+ server : 1
+ graph_server : 1
+ scheduler : true
diff --git a/examples/gnn/config/local_w4.yml b/examples/gnn/config/local_w4.yml
new file mode 100644
index 0000000..8a7ea2d
--- /dev/null
+++ b/examples/gnn/config/local_w4.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 4
+ DMLC_NUM_SERVER : 1
+launch :
+ worker : 4
+ server : 1
+ graph_server : 4
+ scheduler : true
diff --git a/examples/gnn/config/local_w8.yml b/examples/gnn/config/local_w8.yml
new file mode 100644
index 0000000..cc83196
--- /dev/null
+++ b/examples/gnn/config/local_w8.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 8
+ DMLC_NUM_SERVER : 1
+launch :
+ worker : 8
+ server : 1
+ graph_server : 4
+ scheduler : true
diff --git a/examples/gnn/config/single.yml b/examples/gnn/config/single.yml
new file mode 100644
index 0000000..f669fca
--- /dev/null
+++ b/examples/gnn/config/single.yml
@@ -0,0 +1,10 @@
+shared :
+ DMLC_PS_ROOT_URI : 127.0.0.1
+ DMLC_PS_ROOT_PORT : 13100
+ DMLC_NUM_WORKER : 1
+ DMLC_NUM_SERVER : 0
+launch :
+ worker : 1
+ server : 0
+ graph_server : 1
+ scheduler : true
diff --git a/examples/gnn/gnn_model/__init__.py b/examples/gnn/gnn_model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/gnn/gnn_model/layer.py b/examples/gnn/gnn_model/layer.py
new file mode 100644
index 0000000..9270a02
--- /dev/null
+++ b/examples/gnn/gnn_model/layer.py
@@ -0,0 +1,68 @@
+import hetu as ht
+from hetu import init
+
+
+class GCN(object):  # graph convolution layer: out = act(A_norm @ (x W + b))
+ def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0,
+ name="GCN", custom_init=None):
+ if custom_init is not None:  # custom_init = (weight_value, bias_value)
+ self.weight = ht.Variable(
+ value=custom_init[0], name=name+"_Weight")
+ self.bias = ht.Variable(value=custom_init[1], name=name+"_Bias")
+ else:
+ self.weight = init.xavier_uniform(
+ shape=(in_features, out_features), name=name+"_Weight")
+ self.bias = init.zeros(shape=(out_features,), name=name+"_Bias")
+ # self.mp is a sparse matrix and should appear in feed_dict later
+ self.mp = norm_adj
+ self.activation = activation  # only "relu" or None is supported
+ self.dropout = dropout
+ self.output_width = out_features
+
+ def __call__(self, x):
+ """
+ Build the computation graph, return the output node
+ """
+ if self.dropout > 0:
+ x = ht.dropout_op(x, 1 - self.dropout)  # argument is the keep probability
+ x = ht.matmul_op(x, self.weight)
+ msg = x + ht.broadcastto_op(self.bias, x)
+ x = ht.csrmm_op(self.mp, msg)  # sparse message passing with the normalized adjacency
+ if self.activation == "relu":
+ x = ht.relu_op(x)
+ elif self.activation is not None:
+ raise NotImplementedError
+ return x
+
+
+class SageConv(object):  # GraphSAGE-style layer: concat(act(A_norm @ x W + b), x W2); output width is 2*out_features
+ def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0,
+ name="GCN", custom_init=None, mp_val=None):  # custom_init/mp_val accepted for interface parity but unused here
+
+ self.weight = init.xavier_uniform(
+ shape=(in_features, out_features), name=name+"_Weight")
+ self.bias = init.zeros(shape=(out_features,), name=name+"_Bias")
+ self.weight2 = init.xavier_uniform(
+ shape=(in_features, out_features), name=name+"_Weight2")  # FIX: was name+"_Weight", colliding with self.weight's name
+ # self.mp is a sparse matrix and should appear in feed_dict later
+ self.mp = norm_adj
+ self.activation = activation  # only "relu" or None is supported
+ self.dropout = dropout
+ self.output_width = 2 * out_features
+
+ def __call__(self, x):
+ """
+ Build the computation graph, return the output node
+ """
+ feat = x  # keep the un-dropped input for the skip branch
+ if self.dropout > 0:
+ x = ht.dropout_op(x, 1 - self.dropout)
+
+ x = ht.csrmm_op(self.mp, x)  # aggregate neighbors first, then transform
+ x = ht.matmul_op(x, self.weight)
+ x = x + ht.broadcastto_op(self.bias, x)
+ if self.activation == "relu":
+ x = ht.relu_op(x)
+ elif self.activation is not None:
+ raise NotImplementedError
+ return ht.concat_op(x, ht.matmul_op(feat, self.weight2), axis=1)
diff --git a/examples/gnn/gnn_model/model.py b/examples/gnn/gnn_model/model.py
new file mode 100644
index 0000000..502271c
--- /dev/null
+++ b/examples/gnn/gnn_model/model.py
@@ -0,0 +1,62 @@
+import hetu as ht
+import numpy as np
+from .layer import GCN, SageConv
+
+
+def convert_to_one_hot(vals, max_val=0):
+ """Helper method to convert label array to one-hot array."""
+ if max_val == 0:  # max_val=0 means infer the class count from the data
+ max_val = vals.max() + 1
+ one_hot_vals = np.zeros((vals.size, max_val))
+ one_hot_vals[np.arange(vals.size), vals] = 1
+ return one_hot_vals
+
+
+def sparse_model(int_feature, hidden_layer_size, embedding_idx_max, embedding_width, num_classes, lr):  # 2-layer GCN over learned sparse embeddings; masked softmax CE loss
+ y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot(
+ g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu()))  # labels taken from the second-to-last int-feature column
+ mask_ = ht.Variable(name="mask_")
+ index_ = ht.GNNDataLoaderOp(lambda g: ht.array(
+ g.i_feat[:, 0:-2], ctx=ht.cpu()), ctx=ht.cpu())  # remaining int-feature columns are embedding indices
+ embedding = ht.init.random_normal(
+ [embedding_idx_max, embedding_width], stddev=0.1)
+ embed = ht.embedding_lookup_op(embedding, index_)
+ feat = ht.array_reshape_op(embed, (-1, int_feature * embedding_width))
+
+ norm_adj_ = ht.Variable("message_passing", trainable=False, value=None)  # fed at run time with the normalized adjacency
+ gcn1 = GCN(int_feature * embedding_width,
+ hidden_layer_size, norm_adj_, activation="relu")
+ gcn2 = GCN(gcn1.output_width, num_classes, norm_adj_)
+ x = gcn1(feat)
+ y = gcn2(x)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ train_loss = loss * mask_  # mask excludes padded / out-of-split nodes from the loss
+ train_loss = ht.reduce_mean_op(train_loss, [0])
+ opt = ht.optim.SGDOptimizer(lr)
+ train_op = opt.minimize(train_loss)
+ # model input & model output
+ return [loss, y, train_op], [mask_, norm_adj_]
+
+
+def dense_model(feature_dim, hidden_layer_size, num_classes, lr, arch=GCN):  # 2-layer graph net (GCN or SageConv) on dense node features + linear classifier
+ y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot(
+ g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu()))  # labels taken from the second-to-last int-feature column
+ mask_ = ht.Variable(name="mask_")
+ feat = ht.GNNDataLoaderOp(lambda g: ht.array(
+ g.f_feat, ctx=ht.cpu()), ctx=ht.cpu())
+
+ norm_adj_ = ht.Variable("message_passing", trainable=False, value=None)  # fed at run time with the normalized adjacency
+ gcn1 = arch(feature_dim, hidden_layer_size, norm_adj_, activation="relu")
+ gcn2 = arch(gcn1.output_width, hidden_layer_size,
+ norm_adj_, activation="relu")  # output_width differs per arch (SageConv doubles it)
+ classifier = ht.init.xavier_uniform(shape=(gcn2.output_width, num_classes))
+ x = gcn1(feat)
+ x = gcn2(x)
+ y = ht.matmul_op(x, classifier)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ train_loss = loss * mask_  # mask excludes padded / out-of-split nodes from the loss
+ train_loss = ht.reduce_mean_op(train_loss, [0])
+ opt = ht.optim.SGDOptimizer(lr)
+ train_op = opt.minimize(train_loss)
+ # model input & model output
+ return [loss, y, train_op], [mask_, norm_adj_]
diff --git a/examples/gnn/gnn_model/utils.py b/examples/gnn/gnn_model/utils.py
new file mode 100644
index 0000000..33feeb9
--- /dev/null
+++ b/examples/gnn/gnn_model/utils.py
@@ -0,0 +1,46 @@
+import hetu
+import graphmix
+import numpy as np
+from tqdm import tqdm
+
+
+def padding(graph, target_num_nodes):
+ # Pad `graph` to exactly `target_num_nodes` nodes so that all sampled
+ # subgraphs share one shape. Padding rows replicate node 0's features;
+ # padded `extra` rows are zero, so they drop out of train/eval masks.
+ assert graph.num_nodes <= target_num_nodes
+ graph.convert2coo()
+ new_graph = graphmix.Graph(graph.edge_index, target_num_nodes)
+ new_graph.tag = graph.tag
+ new_graph.type = graph.type
+ extra = target_num_nodes - graph.num_nodes
+ new_graph.i_feat = np.concatenate(
+ [graph.i_feat, np.tile(graph.i_feat[0], [extra, 1])])
+ new_graph.f_feat = np.concatenate(
+ [graph.f_feat, np.tile(graph.f_feat[0], [extra, 1])])
+ # `extra` may be empty; only pad it when the source graph carries one.
+ if graph.extra.size:
+ new_graph.extra = np.concatenate([graph.extra, np.zeros([extra, 1])])
+ return new_graph
+
+
+def prepare_data(ngraph):
+ # Pull `ngraph` sampled subgraphs from the GraphMix servers, then pad
+ # them all to the largest node count so shapes match across batches.
+ cli = graphmix.Client()
+ graphs = []
+ for i in tqdm(range(ngraph)):
+ query = cli.pull_graph()
+ graph = cli.wait(query)
+ graphs.append(graph)
+ max_num_nodes = 0
+ for i in range(ngraph):
+ max_num_nodes = max(max_num_nodes, graphs[i].num_nodes)
+ for i in range(ngraph):
+ graphs[i] = padding(graphs[i], max_num_nodes)
+ return graphs
+
+
+def get_norm_adj(graph, device, use_original_gcn_norm=False):
+ # Build the (num_nodes x num_nodes) normalized adjacency of `graph` as a
+ # hetu sparse array on `device`. Note the index order: (edge_index[1],
+ # edge_index[0]) — rows are destinations, columns are sources.
+ norm = graph.gcn_norm(use_original_gcn_norm)
+ mp_mat = hetu.ndarray.sparse_array(
+ values=norm,
+ indices=(graph.edge_index[1], graph.edge_index[0]),
+ shape=(graph.num_nodes, graph.num_nodes),
+ ctx=device
+ )
+ return mp_mat
diff --git a/examples/gnn/gnn_tools/__init__.py b/examples/gnn/gnn_tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/gnn/gnn_tools/launcher.py b/examples/gnn/gnn_tools/launcher.py
new file mode 100644
index 0000000..bebb77b
--- /dev/null
+++ b/examples/gnn/gnn_tools/launcher.py
@@ -0,0 +1,135 @@
+import os
+import os.path as osp
+import signal
+import yaml
+import multiprocessing
+
+import libc_graphmix as _C
+import hetu as ht
+from graphmix.shard import Shard
+
+default_graph_root_port = 27770
+
+
+def start_graph_server(shard, server_init):
+ # Entry point of a GraphMix graph-server process: load this rank's graph
+ # shard, publish meta/data, then hand control to the user `server_init`.
+ os.environ['GRAPHMIX_ROLE'] = "server"
+ _C.init()
+ shard.load_graph_shard(_C.rank())
+ server = _C.start_server()
+ server.init_meta(shard.meta)
+ server.init_data(shard.f_feat, shard.i_feat, shard.edges)
+ # Release this process's copy of the shard once the server owns the data.
+ del shard
+ print("GraphMix Server {} : data initialized at {}:{}".format(
+ _C.rank(), _C.ip(), _C.port()))
+ _C.barrier_all()
+ server_init(server)
+ _C.finalize()
+
+
+def start_server():
+ # Entry point of a Hetu parameter-server process; blocks until shutdown.
+ os.environ["DMLC_ROLE"] = "server"
+ ht.server_init()
+ ht.server_finish()
+
+# two schedulers (GraphMix and Hetu PS) run in one process
+
+
+def start_scheduler():
+ # Entry point of the scheduler process: runs both the GraphMix scheduler
+ # (_C) and the Hetu PS scheduler in the same process.
+ os.environ['GRAPHMIX_ROLE'] = "scheduler"
+ os.environ['DMLC_ROLE'] = "scheduler"
+ _C.init()
+ ht.scheduler_init()
+ ht.scheduler_finish()
+ _C.finalize()
+
+
+def start_worker(func, args):
+ # Entry point of a worker process: initialize both frameworks, derive the
+ # worker's local rank, then run the user training function `func(args)`.
+ os.environ['GRAPHMIX_ROLE'] = "worker"
+ os.environ['DMLC_ROLE'] = "worker"
+ _C.init()
+ ht.worker_init()
+ # Local rank = global GraphMix rank modulo workers-per-machine.
+ args.local_rank = _C.rank() % args.num_local_worker
+ _C.barrier_all()
+ func(args)
+ ht.worker_finish()
+ _C.finalize()
+
+
+def start_worker_standalone(func, args, local_rank):
+ # Run the training function directly, without any PS/GraphMix setup.
+ args.local_rank = local_rank
+ func(args)
+
+
+def signal_handler(signal, frame):
+ # SIGINT handler installed by the launcher: kill every child process
+ # in `process_list`, then exit the launcher itself.
+ print("SIGINT signal caught, stop Training")
+ for proc in process_list:
+ proc.kill()
+ exit(0)
+
+
+process_list = []
+
+
+def launch_graphmix_and_hetu_ps(target, args, server_init, hybrid_config=None):
+ # Launch the full local job described by the YAML file at args.config:
+ # worker processes running `target(args)`, GraphMix graph servers
+ # (initialized by `server_init`), Hetu PS servers, and (optionally) the
+ # schedulers. `hybrid_config` overrides the config: "worker" runs exactly
+ # one worker and no servers/scheduler; "server" runs servers only.
+ # Blocks until every child process exits; Ctrl-C kills all children.
+ # open setting file
+ file_path = osp.abspath(osp.expanduser(osp.normpath(args.config)))
+ with open(file_path) as setting_file:
+ # NOTE(review): config is trusted here; yaml.safe_load would suffice.
+ settings = yaml.load(setting_file.read(), Loader=yaml.FullLoader)
+
+ # write environment variables
+ for key, value in settings["shared"].items():
+ os.environ[str(key)] = str(value)
+
+ # the graph data path is relative to the setting file path
+ graph_data_path = osp.abspath(osp.expanduser(osp.normpath(args.path)))
+ print("GraphMix launcher : Using Graph Data from ", graph_data_path)
+
+ # load graph and set the server number equal to the number of graph parts
+ shard = Shard(graph_data_path)
+ os.environ['GRAPHMIX_NUM_SERVER'] = str(shard.meta["num_part"])
+ os.environ['GRAPHMIX_NUM_WORKER'] = os.environ['DMLC_NUM_WORKER']
+ os.environ['GRAPHMIX_ROOT_URI'] = os.environ['DMLC_PS_ROOT_URI']
+ os.environ['GRAPHMIX_ROOT_PORT'] = str(default_graph_root_port)
+ if 'DMLC_INTERFACE' in os.environ.keys():
+ os.environ['GRAPHMIX_INTERFACE'] = os.environ['DMLC_INTERFACE']
+
+ # get local job number
+ args.num_local_worker = int(settings["launch"]["worker"])
+ args.num_local_graph_server = int(settings["launch"]["graph_server"])
+ args.num_local_server = int(settings["launch"]["server"])
+ args.scheduler = settings["launch"]["scheduler"]
+ # Sanity checks: local counts may not exceed the cluster-wide totals.
+ assert args.num_local_graph_server <= shard.meta["num_part"]
+ assert args.num_local_worker <= int(os.environ['DMLC_NUM_WORKER'])
+ assert args.num_local_server <= int(os.environ['DMLC_NUM_SERVER'])
+ if hybrid_config == "worker":
+ args.num_local_server = 0
+ args.num_local_graph_server = 0
+ args.scheduler = False
+ args.num_local_worker = 1
+ elif hybrid_config == "server":
+ args.num_local_worker = 0
+
+ # launch workers
+ for i in range(args.num_local_worker):
+ proc = multiprocessing.Process(
+ target=start_worker, args=[target, args])
+ process_list.append(proc)
+ # launch graph servers
+ for i in range(args.num_local_graph_server):
+ proc = multiprocessing.Process(
+ target=start_graph_server, args=[shard, server_init])
+ process_list.append(proc)
+ # launch ps servers
+ for i in range(args.num_local_server):
+ proc = multiprocessing.Process(target=start_server, args=[])
+ process_list.append(proc)
+ # launch scheduler
+ if args.scheduler:
+ proc = multiprocessing.Process(target=start_scheduler)
+ process_list.append(proc)
+ # wait until all processes finish
+ for proc in process_list:
+ proc.start()
+ signal.signal(signal.SIGINT, signal_handler)
+ for proc in process_list:
+ proc.join()
diff --git a/examples/gnn/gnn_tools/log.py b/examples/gnn/gnn_tools/log.py
new file mode 100644
index 0000000..1c5f536
--- /dev/null
+++ b/examples/gnn/gnn_tools/log.py
@@ -0,0 +1,68 @@
+import multiprocessing
+import numpy as np
+import time
+
+logfile = open("log.txt", "w")
+
+
+class SharedTrainingStat():
+ # Accumulates train/test accuracy and loss across worker processes via a
+ # multiprocessing.Manager; `print` reports the running averages, logs them
+ # to `logfile`, and resets all counters for the next epoch.
+ def __init__(self):
+ self.manager = multiprocessing.Manager()
+ self.lock = self.manager.Lock()
+ # Test-side accumulators: node count, correct count, summed loss, #updates.
+ self.total = self.manager.Value("total", 0)
+ self.acc = self.manager.Value("acc", 0)
+ self.loss = self.manager.Value("loss", 0.0)
+ self.count = self.manager.Value("count", 0)
+ # Train-side accumulators, same layout.
+ self.train_total = self.manager.Value("train_total", 0)
+ self.train_acc = self.manager.Value("train_acc", 0)
+ self.train_loss = self.manager.Value("train_loss", 0.0)
+ self.train_count = self.manager.Value("train_count", 0)
+ # Wall-clock timestamps of each print(); local to the printing process.
+ self.time = []
+
+ def update(self, acc, total, loss):
+ # Accumulate one batch of test statistics (thread/process safe).
+ self.lock.acquire()
+ self.total.value += total
+ self.acc.value += acc
+ self.loss.value += loss
+ self.count.value += 1
+ self.lock.release()
+
+ def update_train(self, acc, total, loss):
+ # Accumulate one batch of training statistics (thread/process safe).
+ self.lock.acquire()
+ self.train_total.value += total
+ self.train_acc.value += acc
+ self.train_loss.value += loss
+ self.train_count.value += 1
+ self.lock.release()
+
+ def print(self, start=""):
+ # Report averaged stats since the last call, then reset the counters.
+ # `start` is printed as a prefix (callers pass the epoch number).
+ self.lock.acquire()
+ # After a few epochs, also report mean/variance of epoch wall time.
+ if len(self.time) > 3:
+ epoch_time = np.array(self.time[1:])-np.array(self.time[:-1])
+ print(
+ "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time)))
+ self.time.append(time.time())
+ print(
+ start,
+ "test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format(
+ self.loss.value / self.count.value,
+ self.acc.value / self.total.value,
+ self.train_loss.value / self.train_count.value,
+ self.train_acc.value / self.train_total.value
+ )
+ )
+ print(
+ self.loss.value / self.count.value, self.acc.value / self.total.value,
+ self.train_loss.value / self.train_count.value, self.train_acc.value /
+ self.train_total.value,
+ file=logfile, flush=True
+ )
+ self.total.value = 0
+ self.acc.value = 0
+ self.loss.value = 0
+ self.count.value = 0
+ self.train_total.value = 0
+ self.train_acc.value = 0
+ self.train_loss.value = 0
+ self.train_count.value = 0
+ self.lock.release()
diff --git a/examples/gnn/gnn_tools/part_graph.py b/examples/gnn/gnn_tools/part_graph.py
new file mode 100644
index 0000000..45896e5
--- /dev/null
+++ b/examples/gnn/gnn_tools/part_graph.py
@@ -0,0 +1,25 @@
+from graphmix.partition import part_graph
+from graphmix.dataset import load_dataset
+from sparse_datasets import load_sparse_dataset
+import argparse
+import os.path as osp
+import yaml
+
+if __name__ == '__main__':
+ # CLI: partition a sparse dataset into `nparts` shards under `path`,
+ # then record the dataset's max sparse index in the generated meta.yml.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--dataset", "-d", required=True)
+ parser.add_argument("--nparts", "-n", required=True)
+ parser.add_argument("--path", "-p", required=True)
+ args = parser.parse_args()
+ output_path = str(args.path)
+ nparts = int(args.nparts)
+ dataset, idx_max = load_sparse_dataset(args.dataset)
+ # Shards live in <path>/<dataset>/.
+ output_path = osp.expanduser(osp.join(output_path, args.dataset))
+ part_graph(dataset, nparts, output_path)
+ # now write idx_max into meta.yml
+ meta_file = osp.join(output_path, "meta.yml")
+ with open(meta_file) as f:
+ meta = yaml.load(f.read(), Loader=yaml.FullLoader)
+ meta["idx_max"] = idx_max
+ with open(meta_file, "w") as f:
+ yaml.dump(meta, f, sort_keys=False)
diff --git a/examples/gnn/gnn_tools/prepare_amazon_data.py b/examples/gnn/gnn_tools/prepare_amazon_data.py
new file mode 100644
index 0000000..71beb0d
--- /dev/null
+++ b/examples/gnn/gnn_tools/prepare_amazon_data.py
@@ -0,0 +1,120 @@
+import json
+import ast
+import numpy as np
+import nltk
+# all products with metadata
+# Input/output locations, relative to the working directory.
+filemap = {'train': './amazon-3M_train_map.txt',
+ 'test': './amazon-3M_test_map.txt',
+ 'bow': './amazon-3M_feature_map.txt',
+ 'meta': './metadata.json',
+ 'output': './graph.npz',
+ 'output_sparse': './sparsefeature.npy'}
+
+
+def getBagofWord():
+ # Load the bag-of-words vocabulary file into a word -> id map.
+ bow = dict()
+ with open(filemap['bow'], 'r') as f:
+ # start with 1, 0 for padding
+ word_cnt = 1
+ for line in f.read().strip().split():
+ bow[line] = word_cnt
+ word_cnt += 1
+ return bow
+
+
+def gettoken(descriptions, length):
+ # Tokenize each description with nltk, keep only in-vocabulary words,
+ # and truncate/zero-pad each row to exactly `length` token ids.
+ # Returns an array of shape (len(descriptions), length).
+ bow = getBagofWord()
+ token_matrix = []
+ for desc in descriptions:
+ token_id = []
+ token = nltk.word_tokenize(desc.lower())
+ for word in token:
+ if word in bow:
+ token_id.append(bow[word])
+ if len(token_id) == length:
+ break
+ # 0 is the padding id (see getBagofWord).
+ while len(token_id) < length:
+ token_id.append(0)
+ token_matrix.append(token_id)
+ return np.array(token_matrix)
+
+
+# Stage 1: parse metadata.json (one python-literal dict per line); keep only
+# products that have all three of 'related', 'categories' and 'description'.
+prod_all = dict()
+prod_rcd = dict()
+with open(filemap['meta'], 'r') as f:
+ for line in f:
+ prod = ast.literal_eval(line.strip().replace('\n', '\\n'))
+ asin = prod['asin']
+ prod_all[asin] = prod
+ if 'related' in prod and 'categories' in prod and 'description' in prod:
+ prod_rcd[asin] = prod
+
+# Stage 2: assign dense ids to usable products, remembering which are test.
+testNodes = set()
+prod_gcn = dict()
+asin2id = dict()
+cnt_id = 0
+asinlist = []
+
+for kword in ['train', 'test']:
+ with open(filemap[kword], 'r') as f:
+ for line in f:
+ asin = line.split()[0]
+ if asin in prod_rcd:
+ if kword == 'test':
+ testNodes.add(asin)
+ prod_gcn[asin] = prod_rcd[asin]
+ asin2id[asin] = cnt_id
+ cnt_id += 1
+ asinlist.append(asin)
+
+graphlen = len(prod_gcn)
+print('#products with rel/cat/des/feat (GCN assumptions)', graphlen)
+print('#trainNodes:', graphlen-len(testNodes), 'testNodes:', len(testNodes))
+
+print(len(asin2id))
+
+# Stage 3: build class labels (top-level category), the train/test split map,
+# and the text used for sparse features. cat2id maps category ->
+# (class id, #train nodes, #test nodes).
+cat2id = dict()
+cnt_id = 0
+
+class_map = np.zeros(graphlen).astype(np.int32)
+train_map = np.zeros(graphlen).astype(np.int32)
+descriptions = []
+for idx, asin in enumerate(asinlist):
+ prod = prod_gcn[asin]
+ isTest = True if asin in testNodes else False
+
+ cat = prod['categories'][0][0]
+ if cat not in cat2id:
+ cat2id[cat] = (cnt_id, 0, 0)
+ cnt_id += 1
+
+ if isTest:
+ cat2id[cat] = (cat2id[cat][0], cat2id[cat][1], cat2id[cat][2]+1)
+ else:
+ cat2id[cat] = (cat2id[cat][0], cat2id[cat][1]+1, cat2id[cat][2])
+
+ class_map[idx] = cat2id[cat][0]
+ train_map[idx] = 0 if isTest else 1
+ if "title" in prod:
+ descriptions.append(prod["title"] + " " + prod['description'])
+ else:
+ descriptions.append(prod['description'])
+
+print('Classes:', cat2id)
+print("Num Classes:", len(cat2id))
+
+# Stage 4: collect undirected co-purchase edges (deduplicated via a
+# canonically-ordered pair), then save the graph and the token features.
+links_set = set()
+for idx, asin in enumerate(asinlist):
+ for rel, neighbors in prod_gcn[asin]['related'].items():
+ for asin_nei in neighbors:
+ if asin_nei not in asin2id:
+ continue
+ idx_nei = asin2id[asin_nei]
+ lk = (idx, idx_nei) if idx_nei > idx else (idx_nei, idx)
+ if lk not in links_set:
+ links_set.add(lk)
+links = np.array(list(links_set))
+print('#links between products:', len(links))
+token_matrix = gettoken(descriptions, 16)
+np.savez(file=filemap['output'], y=class_map, train_map=train_map, edge=links)
+np.save(file=filemap['output_sparse'], arr=token_matrix)
diff --git a/examples/gnn/gnn_tools/sparse_datasets.py b/examples/gnn/gnn_tools/sparse_datasets.py
new file mode 100644
index 0000000..54582f5
--- /dev/null
+++ b/examples/gnn/gnn_tools/sparse_datasets.py
@@ -0,0 +1,89 @@
+import graphmix
+from graphmix.dataset import load_dataset
+import numpy as np
+import os.path as osp
+
+
+class AmazonSparseDataset():
+ # Wraps the files produced by prepare_amazon_data.py into the dataset
+ # interface expected by part_graph: x, y, train_mask, graph, num_classes,
+ # idx_max. y packs [sparse token ids | unique node id | class label].
+ def __init__(self, dataset_root):
+ self.name = "AmazonSparse"
+ data = np.load(osp.join(dataset_root, "graph.npz"))
+ feat = np.load(osp.join(dataset_root, "sparsefeature.npy"))
+ num_nodes = feat.shape[0]
+ edge = data['edge'].T
+ # Duplicate each edge in the reverse direction -> undirected graph.
+ directed = np.concatenate([edge, edge[[1, 0]]], axis=1)
+ # Node-id embeddings live after the token-id range; idx_max is the
+ # total embedding-table size.
+ self.idx_max = np.max(feat) + 1
+ node_id = np.arange(num_nodes).reshape(-1, 1) + self.idx_max
+ self.idx_max += num_nodes
+ # No dense float features for this dataset.
+ self.x = np.empty([num_nodes, 0])
+ self.y = np.concatenate(
+ [feat, node_id, data['y'].reshape(-1, 1)], axis=-1)
+ self.train_mask = data["train_map"]
+ self.graph = graphmix.Graph(
+ edge_index=directed,
+ num_nodes=num_nodes
+ )
+ self.num_classes = int(np.max(data['y']) + 1)
+
+
+class OGBNmagDataset():
+ # ogbn-mag (papers only) in the same interface as AmazonSparseDataset.
+ # y packs [field-of-study ids | author ids | node id | label]; papers
+ # published before 2018 form the training split.
+ def __init__(self, dataset_root):
+ self.name = "ogbn-mag"
+ # Imported lazily so ogb is only required for this dataset.
+ from ogb.nodeproppred import PygNodePropPredDataset
+ dataset = PygNodePropPredDataset(name=self.name, root=dataset_root)
+ data = dataset[0]
+ year = data.node_year['paper'].numpy()
+ self.train_mask = year < 2018
+ edge = data.edge_index_dict['paper', 'cites', 'paper'].numpy()
+ # Duplicate edges in the reverse direction -> undirected citation graph.
+ directed = np.concatenate([edge, edge[[1, 0]]], axis=1)
+ num_nodes = data.num_nodes_dict['paper']
+ self.graph = graphmix.Graph(
+ edge_index=directed,
+ num_nodes=num_nodes
+ )
+ self.num_classes = dataset.num_classes
+
+ # Nested helper (closes over num_nodes): turn a (2, E) relation into a
+ # fixed-width id matrix — truncate to `length`, pad with -1, then shift
+ # by (base + 1) so ids land in a disjoint embedding range (pad -> base).
+ def process_sparse_idx(rel, length, base):
+ sp_idx = [[] for i in range(num_nodes)]
+ for i, j in rel.T:
+ sp_idx[i].append(j)
+ for i in range(num_nodes):
+ if len(sp_idx[i]) > length:
+ sp_idx[i] = sp_idx[i][0:length]
+ while len(sp_idx[i]) < length:
+ sp_idx[i].append(-1)
+ sp_idx = np.array(sp_idx)
+ sp_idx += (base + 1)
+ return sp_idx
+
+ node_id = np.arange(num_nodes).reshape(-1, 1)
+ field = data.edge_index_dict[(
+ 'paper', 'has_topic', 'field_of_study')].numpy()
+ paper_field = process_sparse_idx(field, 10, num_nodes)
+ idx_max = num_nodes + data.num_nodes_dict['field_of_study'] + 1
+ # author->paper edges are transposed to paper->author.
+ author = data.edge_index_dict[('author', 'writes', 'paper')].numpy()
+ paper_author = process_sparse_idx(author[[1, 0]], 10, idx_max)
+ idx_max += data.num_nodes_dict['author'] + 1
+ self.idx_max = idx_max
+ self.x = np.empty([num_nodes, 0])
+ self.y = np.concatenate([
+ paper_field, paper_author, node_id, data.y_dict["paper"].numpy()
+ ], axis=1)
+
+
+def load_sparse_dataset(name):
+ # Load a dataset with sparse (embedding-id) features by name.
+ # Returns (dataset, idx_max) where idx_max is the embedding-table size.
+ root_dir = osp.expanduser(osp.join('~/.graphmix_dataset/', name))
+ if name == "Reddit":
+ # Reddit has no sparse features; use the node id itself as one.
+ dataset = load_dataset(name)
+ idx_max = dataset.x.shape[0]
+ node_id = np.arange(idx_max).reshape(-1, 1)
+ dataset.y = np.concatenate([node_id, dataset.y.reshape(-1, 1)], axis=1)
+ elif name == "AmazonSparse":
+ dataset = AmazonSparseDataset(root_dir)
+ idx_max = dataset.idx_max
+ elif name == "ogbn-mag":
+ dataset = OGBNmagDataset(root_dir)
+ idx_max = dataset.idx_max
+ else:
+ raise NotImplementedError
+ return dataset, int(idx_max)
diff --git a/examples/gnn/run_dist.py b/examples/gnn/run_dist.py
new file mode 100644
index 0000000..8b1bccc
--- /dev/null
+++ b/examples/gnn/run_dist.py
@@ -0,0 +1,91 @@
+from gnn_tools.launcher import launch_graphmix_and_hetu_ps
+from gnn_model.utils import get_norm_adj, prepare_data
+from gnn_model.model import sparse_model
+from gnn_tools.log import SharedTrainingStat
+import graphmix
+
+import hetu as ht
+
+import numpy as np
+import argparse
+
+# usage : on each machine
+# python3 run_dist.py [configfile] [-p data_path]
+
+
+def train_main(args):
+ # Worker entry point (PS mode): pre-fetch padded subgraphs, build the
+ # sparse GNN model, then train for args.num_epoch epochs, reporting
+ # masked train/eval accuracy into the shared `stat` object.
+ # NOTE(review): relies on `stat` being inherited from the launcher via
+ # fork-based multiprocessing — confirm on non-fork platforms.
+ cli = graphmix.Client()
+ meta = cli.meta
+ hidden_layer_size = args.hidden_size
+ num_epoch = args.num_epoch
+ rank = cli.rank()
+ nrank = cli.num_worker()
+ # One GPU per local worker.
+ ctx = ht.gpu(rank % args.num_local_worker)
+ embedding_width = args.hidden_size
+ # the last two int-feature columns are the train label and the train mask
+ num_int_feature = meta["int_feature"] - 2
+ # sample some graphs
+ ngraph = meta["train_node"] // (args.batch_size * nrank)
+ graphs = prepare_data(ngraph)
+ # build model
+ [loss, y, train_op], [mask_, norm_adj_] = sparse_model(
+ num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate)
+
+ idx = 0
+ graph = graphs[idx]
+ idx = (idx + 1) % ngraph
+ # Stepped twice to prime the double-buffered loader — TODO confirm.
+ ht.GNNDataLoaderOp.step(graph)
+ ht.GNNDataLoaderOp.step(graph)
+ executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
+ use_sparse_pull=False, cstable_policy=args.cache)
+ nbatches = meta["train_node"] // (args.batch_size * nrank)
+ for epoch in range(num_epoch):
+ for _ in range(nbatches):
+ # Queue the next graph while training on the current one.
+ graph_nxt = graphs[idx]
+ idx = (idx + 1) % ngraph
+ ht.GNNDataLoaderOp.step(graph_nxt)
+ # extra[:, 0] marks real (non-padded) nodes; i_feat[:, -1] == 1
+ # marks training nodes.
+ train_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] == 1)
+ eval_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] != 1)
+ feed_dict = {
+ norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)),
+ mask_: train_mask
+ }
+ loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
+ y_predicted = y_predicted.asnumpy().argmax(axis=1)
+
+ # i_feat[:, -2] holds the ground-truth label.
+ acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask)
+ train_acc = np.sum(
+ (y_predicted == graph.i_feat[:, -2]) * train_mask)
+ stat.update(acc, eval_mask.sum(), np.sum(
+ loss_val.asnumpy()*eval_mask)/eval_mask.sum())
+ stat.update_train(train_acc, train_mask.sum(), np.sum(
+ loss_val.asnumpy()*train_mask)/train_mask.sum())
+ ht.get_worker_communicate().BarrierWorker()
+ graph = graph_nxt
+ if rank == 0:
+ stat.print(epoch)
+
+
+def server_init(server):
+ # Configure each GraphMix graph server: an LFUOpt cache sized at 10% and
+ # a 2-hop GraphSage sampler producing subgraph batches.
+ batch_size = args.batch_size
+ server.init_cache(0.1, graphmix.cache.LFUOpt)
+ worker_per_server = server.num_worker() // server.num_server()
+ server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size,
+ depth=2, width=2, thread=4 * worker_per_server, subgraph=True)
+ server.is_ready()
+
+
+if __name__ == '__main__':
+ # CLI entry point: parse hyper-parameters and launch workers + servers.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("config")
+ parser.add_argument("--path", "-p", required=True)
+ parser.add_argument("--num_epoch", default=300, type=int)
+ parser.add_argument("--hidden_size", default=128, type=int)
+ parser.add_argument("--learning_rate", default=1, type=float)
+ parser.add_argument("--batch_size", default=128, type=int)
+ parser.add_argument("--cache", default="LFUOpt", type=str)
+ args = parser.parse_args()
+ # Shared across launched processes (read by train_main in each worker).
+ stat = SharedTrainingStat()
+ launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init)
diff --git a/examples/gnn/run_dist_hybrid.py b/examples/gnn/run_dist_hybrid.py
new file mode 100644
index 0000000..21cf333
--- /dev/null
+++ b/examples/gnn/run_dist_hybrid.py
@@ -0,0 +1,153 @@
+from gnn_tools.launcher import launch_graphmix_and_hetu_ps
+from gnn_model.utils import get_norm_adj, prepare_data
+from gnn_model.model import sparse_model
+import graphmix
+
+import hetu as ht
+from hetu.communicator.mpi_nccl_comm import ncclDataType_t, ncclRedOp_t
+
+import numpy as np
+import time
+import os
+import sys
+import multiprocessing
+import argparse
+
+# usage :
+# mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] [-p data_path]
+# python3 run_dist_hybrid.py [configfile] [-p data_path] --server
+
+
+class TrainStat():
+ # Per-worker train/test statistics for hybrid mode; epoch totals are
+ # reduced across workers with NCCL AllReduce, printed by device 0.
+ # Each stat vector is [#updates, #correct, #nodes, summed loss].
+ def __init__(self, comm):
+ self.file = open("log.txt", "w")
+ self.train_stat = np.zeros(4)
+ self.test_stat = np.zeros(4)
+ self.count = 0
+ self.time = []
+ self.comm = comm
+
+ def update_test(self, cnt, total, loss):
+ # Accumulate one batch of test stats: [1, correct, nodes, loss].
+ self.test_stat += [1, cnt, total, loss]
+
+ def update_train(self, cnt, total, loss):
+ # Accumulate one batch of train stats: [1, correct, nodes, loss].
+ self.train_stat += [1, cnt, total, loss]
+
+ def sync_and_clear(self):
+ # AllReduce both stat vectors across workers, report the averages on
+ # device 0 (stdout + log file), then reset the local accumulators.
+ self.count += 1
+ train_stat = ht.array(self.train_stat, ht.cpu())
+ test_stat = ht.array(self.test_stat, ht.cpu())
+ self.comm.dlarrayNcclAllReduce(
+ train_stat, train_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream)
+ self.comm.dlarrayNcclAllReduce(
+ test_stat, test_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream)
+ self.comm.stream.sync()
+ train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy()
+ printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format(
+ self.count,
+ test_stat[3] / test_stat[0],
+ test_stat[1] / test_stat[2],
+ train_stat[3] / train_stat[0],
+ train_stat[1] / train_stat[2],
+ )
+ logstr = "{} {} {} {}".format(
+ test_stat[3] / test_stat[0],
+ test_stat[1] / test_stat[2],
+ train_stat[3] / train_stat[0],
+ train_stat[1] / train_stat[2],
+ )
+ self.time.append(time.time())
+ if self.comm.device_id.value == 0:
+ print(printstr, flush=True)
+ print(logstr, file=self.file, flush=True)
+ # After a few epochs, also report mean/variance of epoch wall time.
+ if len(self.time) > 3:
+ epoch_time = np.array(self.time[1:])-np.array(self.time[:-1])
+ print(
+ "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time)))
+
+ self.train_stat[:] = 0
+ self.test_stat[:] = 0
+
+
+def train_main(args):
+ # Worker entry point (hybrid PS + NCCL AllReduce mode). Same training
+ # loop as run_dist.py, but stats are reduced via TrainStat/NCCL and the
+ # executor runs with comm_mode='Hybrid'.
+ comm = ht.wrapped_mpi_nccl_init()
+ device_id = comm.dev_id
+ cli = graphmix.Client()
+ meta = cli.meta
+ hidden_layer_size = args.hidden_size
+ num_epoch = args.num_epoch
+ rank = cli.rank()
+ nrank = cli.num_worker()
+ ctx = ht.gpu(device_id)
+ embedding_width = args.hidden_size
+ # the last two int-feature columns are the train label and the train mask
+ num_int_feature = meta["int_feature"] - 2
+ # sample some graphs (10x more than run_dist.py to reduce repetition)
+ ngraph = 10 * meta["train_node"] // (args.batch_size * nrank)
+ graphs = prepare_data(ngraph)
+ # build model
+ [loss, y, train_op], [mask_, norm_adj_] = sparse_model(
+ num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate)
+ idx = 0
+ graph = graphs[idx]
+ idx = (idx + 1) % ngraph
+ # Stepped twice to prime the double-buffered loader — TODO confirm.
+ ht.GNNDataLoaderOp.step(graph)
+ ht.GNNDataLoaderOp.step(graph)
+ executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='Hybrid',
+ use_sparse_pull=False, cstable_policy=args.cache)
+ nbatches = meta["train_node"] // (args.batch_size * nrank)
+ train_state = TrainStat(comm)
+ for epoch in range(num_epoch):
+ for _ in range(nbatches):
+ # Queue the next graph while training on the current one.
+ graph_nxt = graphs[idx]
+ idx = (idx + 1) % ngraph
+ ht.GNNDataLoaderOp.step(graph_nxt)
+ # extra[:, 0] marks real (non-padded) nodes; i_feat[:, -1] == 1
+ # marks training nodes.
+ train_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] == 1)
+ eval_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] != 1)
+ feed_dict = {
+ norm_adj_: get_norm_adj(graph, ht.gpu(device_id)),
+ mask_: train_mask
+ }
+ loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
+ y_predicted = y_predicted.asnumpy().argmax(axis=1)
+
+ # i_feat[:, -2] holds the ground-truth label.
+ acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask)
+ train_acc = np.sum(
+ (y_predicted == graph.i_feat[:, -2]) * train_mask)
+ train_state.update_test(acc, eval_mask.sum(), np.sum(
+ loss_val.asnumpy()*eval_mask)/eval_mask.sum())
+ train_state.update_train(train_acc, train_mask.sum(), np.sum(
+ loss_val.asnumpy()*train_mask)/train_mask.sum())
+ ht.get_worker_communicate().BarrierWorker()
+ graph = graph_nxt
+ train_state.sync_and_clear()
+
+
+def server_init(server):
+ # Configure each GraphMix graph server: an LFUOpt cache sized at 10% and
+ # a 2-hop GraphSage sampler producing subgraph batches.
+ batch_size = args.batch_size
+ server.init_cache(0.1, graphmix.cache.LFUOpt)
+ worker_per_server = server.num_worker() // server.num_server()
+ server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size,
+ depth=2, width=2, thread=4 * worker_per_server, subgraph=True)
+ server.is_ready()
+
+
+if __name__ == '__main__':
+ # CLI entry point. With --server this process hosts only servers and the
+ # scheduler; without it (typically under mpirun) it runs a single worker.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("config")
+ parser.add_argument("--path", "-p", required=True)
+ parser.add_argument("--num_epoch", default=300, type=int)
+ parser.add_argument("--hidden_size", default=128, type=int)
+ parser.add_argument("--learning_rate", default=1, type=float)
+ parser.add_argument("--batch_size", default=128, type=int)
+ parser.add_argument("--cache", default="LFUOpt", type=str)
+ parser.add_argument("--server", action="store_true")
+ args = parser.parse_args()
+ if args.server:
+ launch_graphmix_and_hetu_ps(
+ train_main, args, server_init, hybrid_config="server")
+ else:
+ launch_graphmix_and_hetu_ps(
+ train_main, args, server_init, hybrid_config="worker")
diff --git a/examples/gnn/run_single.py b/examples/gnn/run_single.py
new file mode 100644
index 0000000..b5a8418
--- /dev/null
+++ b/examples/gnn/run_single.py
@@ -0,0 +1,95 @@
+from gnn_tools.launcher import launch_graphmix_and_hetu_ps
+from gnn_tools.log import SharedTrainingStat
+from gnn_model.utils import get_norm_adj, prepare_data
+from gnn_model.model import sparse_model, dense_model
+import graphmix
+
+import hetu as ht
+
+import numpy as np
+import time
+import os
+import sys
+import argparse
+
+# usage
+# python3 run_single.py [-p data_path]
+
+
+def train_main(args):
+ # Worker entry point (single-machine, no PS comm_mode): same loop as
+ # run_dist.py but with a plain executor and an optional dense model.
+ cli = graphmix.Client()
+ meta = cli.meta
+ hidden_layer_size = args.hidden_size
+ num_epoch = args.num_epoch
+ rank = cli.rank()
+ nrank = cli.num_worker()
+ ctx = ht.gpu(rank % args.num_local_worker)
+ embedding_width = args.hidden_size
+ # the last two int-feature columns are the train label and the train mask
+ num_int_feature = meta["int_feature"] - 2
+ # sample some graphs
+ ngraph = meta["train_node"] // (args.batch_size * nrank)
+ graphs = prepare_data(ngraph)
+ # build model: dense float features or sparse embedding features
+ if args.dense:
+ [loss, y, train_op], [mask_, norm_adj_] = dense_model(
+ meta["float_feature"], args.hidden_size, meta["class"], args.learning_rate)
+ else:
+ [loss, y, train_op], [mask_, norm_adj_] = sparse_model(
+ num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate)
+
+ idx = 0
+ graph = graphs[idx]
+ idx = (idx + 1) % ngraph
+ # Stepped twice to prime the double-buffered loader — TODO confirm.
+ ht.GNNDataLoaderOp.step(graph)
+ ht.GNNDataLoaderOp.step(graph)
+ executor = ht.Executor([loss, y, train_op], ctx=ctx)
+ nbatches = meta["train_node"] // (args.batch_size * nrank)
+ for epoch in range(num_epoch):
+ for _ in range(nbatches):
+ # Queue the next graph while training on the current one.
+ graph_nxt = graphs[idx]
+ idx = (idx + 1) % ngraph
+ ht.GNNDataLoaderOp.step(graph_nxt)
+ # extra[:, 0] marks real (non-padded) nodes; i_feat[:, -1] == 1
+ # marks training nodes.
+ train_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] == 1)
+ eval_mask = np.bitwise_and(
+ graph.extra[:, 0], graph.i_feat[:, -1] != 1)
+ feed_dict = {
+ norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)),
+ mask_: train_mask
+ }
+ loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
+ y_predicted = y_predicted.asnumpy().argmax(axis=1)
+
+ # i_feat[:, -2] holds the ground-truth label.
+ acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask)
+ train_acc = np.sum(
+ (y_predicted == graph.i_feat[:, -2]) * train_mask)
+ stat.update(acc, eval_mask.sum(), np.sum(
+ loss_val.asnumpy()*eval_mask)/eval_mask.sum())
+ stat.update_train(train_acc, train_mask.sum(), np.sum(
+ loss_val.asnumpy()*train_mask)/train_mask.sum())
+ graph = graph_nxt
+ stat.print(epoch)
+
+
+def server_init(server):
+ # Configure each GraphMix graph server: an LFUOpt cache sized at 10% and
+ # a 2-hop GraphSage sampler producing subgraph batches.
+ batch_size = args.batch_size
+ server.init_cache(0.1, graphmix.cache.LFUOpt)
+ worker_per_server = server.num_worker() // server.num_server()
+ server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size,
+ depth=2, width=2, thread=4 * worker_per_server, subgraph=True)
+ server.is_ready()
+
+
+if __name__ == '__main__':
+ # CLI entry point for single-machine training.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--config", default="config/single.yml")
+ parser.add_argument("--path", "-p", required=True)
+ parser.add_argument("--num_epoch", default=300, type=int)
+ parser.add_argument("--hidden_size", default=128, type=int)
+ parser.add_argument("--learning_rate", default=1, type=float)
+ parser.add_argument("--batch_size", default=128, type=int)
+ parser.add_argument("--dense", action="store_true")
+ args = parser.parse_args()
+ # Shared across launched processes (read by train_main in each worker).
+ stat = SharedTrainingStat()
+ launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init)
diff --git a/examples/nlp/.gitignore b/examples/nlp/.gitignore
new file mode 100644
index 0000000..c90eef6
--- /dev/null
+++ b/examples/nlp/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+iwslt2016/
+logs/
\ No newline at end of file
diff --git a/examples/nlp/README.md b/examples/nlp/README.md
new file mode 100644
index 0000000..53351d8
--- /dev/null
+++ b/examples/nlp/README.md
@@ -0,0 +1,18 @@
+# NLP Examples
+In this directory we provide a simple implementation of the Transformer model, using the IWSLT2016 de-en dataset.
+## Structure
+```
+- nlp
+ - hparams.py Hyperparameters
+ - prepare_data.py Downloading and preparing data
+ - data_load.py Dataloader
+ - hetu_transformer.py Transformer model in hetu
+ - tf_transformer.py Transformer model in tensorflow
+ - train_hetu_transformer.py Trainer for hetu
+ - train_tf_transformer.py Trainer for tensorflow
+```
+## Usage
+```bash
+python train_{framework}_transformer.py
+```
+To change the hyperparameters, please modify `hparams.py` file.
\ No newline at end of file
diff --git a/examples/nlp/__init__.py b/examples/nlp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/nlp/bookcorpus/bookcorpus.py b/examples/nlp/bookcorpus/bookcorpus.py
new file mode 100644
index 0000000..42c696e
--- /dev/null
+++ b/examples/nlp/bookcorpus/bookcorpus.py
@@ -0,0 +1,92 @@
+# Lint as: python3
+"""The BookCorpus dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """
+
+
+import glob
+import os
+import pathlib
+
+import datasets
+
+
+_DESCRIPTION = """\
+Books are a rich source of both fine-grained information, how a character, \
+an object or a scene looks like, as well as high-level semantics, what \
+someone is thinking, feeling and how these states evolve through a story.\
+This version of bookcorpus has 17868 dataset items (books). Each item contains \
+two fields: title and text. The title is the name of the book (just the file name) \
+while text contains unprocessed book text. The bookcorpus has been prepared by \
+Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community \
+driven platform dedicated to the archiving and long-term preservation of any and \
+all data including but by no means limited to... websites, books, games, software, \
+video, audio, other digital-obscura and ideas.
+"""
+
+_CITATION = """\
+@InProceedings{Zhu_2015_ICCV,
+ title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
+ author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
+ booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
+ month = {December},
+ year = {2015}
+}
+"""
+_PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27"
+# _DOWNLOAD_URL = "https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"
+# NOTE(review): hardcoded machine-local path — not portable. Restore the
+# public URL above (or make this configurable) before sharing this script.
+_DOWNLOAD_URL = "/home/xiaonan/develope/Athena/datasets/books_doc_format.tar.gz"
+
+
+class BookCorpusOpenConfig(datasets.BuilderConfig):
+ """BuilderConfig for BookCorpus."""
+
+ def __init__(self, **kwargs):
+ """BuilderConfig for BookCorpus.
+ Args:
+ **kwargs: keyword arguments forwarded to super.
+ """
+ # Pin the dataset version; all other options pass through to the base.
+ super(BookCorpusOpenConfig, self).__init__(
+ version=datasets.Version("1.0.0", ""), **kwargs)
+
+
+class BookCorpusOpen(datasets.GeneratorBasedBuilder):
+ """BookCorpus dataset builder: one example per book, fields 'title'/'text'."""
+
+ BUILDER_CONFIGS = [
+ BookCorpusOpenConfig(
+ name="plain_text",
+ description="Plain text",
+ )
+ ]
+
+ def _info(self):
+ # Dataset metadata: schema, homepage and citation.
+ return datasets.DatasetInfo(
+ description=_DESCRIPTION,
+ features=datasets.Features(
+ {
+ "title": datasets.Value("string"),
+ "text": datasets.Value("string"),
+ }
+ ),
+ supervised_keys=None,
+ homepage=_PROJECT_URL,
+ citation=_CITATION,
+ )
+
+ def _split_generators(self, dl_manager):
+ # Download/extract the archive; everything goes into a single TRAIN split.
+ arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
+
+ return [
+ datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={
+ "directory": arch_path}),
+ ]
+
+ def _generate_examples(self, directory):
+ # Yield (id, example) for every *.epub.txt file, sorted for determinism;
+ # the file name becomes the title, the raw file contents the text.
+ glob_target = os.path.join(directory, "**/*.epub.txt")
+ book_files = glob.glob(glob_target, recursive=True)
+ book_files = sorted(book_files)
+ _id = 0
+ for book_file_path in book_files:
+ path = pathlib.PurePath(book_file_path)
+ with open(book_file_path, mode="r", encoding="utf-8") as f:
+ yield _id, {"title": str(path.name), "text": f.read()},
+ _id += 1
diff --git a/examples/nlp/data_load.py b/examples/nlp/data_load.py
new file mode 100644
index 0000000..b5966f2
--- /dev/null
+++ b/examples/nlp/data_load.py
@@ -0,0 +1,120 @@
+import numpy as np
+
+
class DataLoader(object):
    """Parallel-corpus loader for seq2seq training.

    Loads aligned source/target sentence files, filters out pairs that
    exceed the length limits, and serves zero-padded numpy batches.
    """

    def __init__(self, fpath1, fpath2, maxlen1, maxlen2, vocab_fpath):
        self.sents1, self.sents2 = self.load_data(
            fpath1, fpath2, maxlen1, maxlen2)
        self.token2idx, self.idx2token = self.load_vocab(vocab_fpath)
        self.maxlen1 = maxlen1
        self.maxlen2 = maxlen2

    def load_vocab(self, vocab_fpath):
        '''Load the vocabulary file and build idx<->token maps.

        vocab_fpath: path to a file with one entry per line; the token is
            the first whitespace-separated field.

        Returns (token2idx, idx2token) dictionaries.
        '''
        # Context manager closes the handle (the original leaked it).
        with open(vocab_fpath, 'r', encoding='utf-8') as f:
            vocab = [line.split()[0] for line in f.read().splitlines()]
        token2idx = {token: idx for idx, token in enumerate(vocab)}
        idx2token = {idx: token for idx, token in enumerate(vocab)}
        return token2idx, idx2token

    def load_data(self, fpath1, fpath2, maxlen1, maxlen2):
        '''Load parallel source/target sentences, dropping too-long pairs.

        fpath1: source file path. fpath2: target file path.
        maxlen1 / maxlen2: maximum source/target sentence lengths.

        Returns (sents1, sents2), kept in aligned order.
        '''
        sents1, sents2 = [], []
        with open(fpath1, 'r', encoding='utf-8') as f1, open(fpath2, 'r', encoding='utf-8') as f2:
            for sent1, sent2 in zip(f1, f2):
                # +1 leaves room for the sentinel token added in encode().
                if len(sent1.split()) + 1 > maxlen1:
                    continue
                if len(sent2.split()) + 1 > maxlen2:
                    continue
                sents1.append(sent1.strip())
                sents2.append(sent2.strip())
        return sents1, sents2

    def encode(self, inp, type, dict):
        '''Convert a sentence string into a list of token ids.

        inp: sentence string.
        type: "x" (source side, trailing sentinel only) or anything else
            (target side, leading and trailing sentinels).
        dict: token2idx mapping; unknown tokens fall back to dict[""].

        NOTE(review): the sentinel token literals are empty strings here --
        presumably "<s>"/"</s>"-style markers were stripped at some point;
        confirm against the vocabulary file.
        '''
        inp_str = inp
        if type == "x":
            tokens = inp_str.split() + [""]
        else:
            tokens = [""] + inp_str.split() + [""]
        return [dict.get(t, dict[""]) for t in tokens]

    def make_epoch_data(self, batch_size, shuffle=False):
        """Encode the corpus and slice it into full batches for one epoch.

        batch_size: examples per batch; a trailing partial batch is dropped.
        shuffle: if True, shuffle the corpus before batching.
        """
        if shuffle:
            import random
            # BUG FIX: the original shuffled sents1 and sents2 independently,
            # destroying the source/target sentence alignment. Shuffle the
            # pairs together instead.
            paired = list(zip(self.sents1, self.sents2))
            random.shuffle(paired)
            new_sents1 = [src for src, _ in paired]
            new_sents2 = [tgt for _, tgt in paired]
        else:
            new_sents1 = list(self.sents1)
            new_sents2 = list(self.sents2)
        xs = [self.encode(sent1, "x", self.token2idx) for sent1 in new_sents1]
        ys = [self.encode(sent2, "y", self.token2idx) for sent2 in new_sents2]
        batch_xs = []
        batch_ys = []
        for start in range(0, len(xs), batch_size):
            batch_xs.append(xs[start:start + batch_size])
            batch_ys.append(ys[start:start + batch_size])
        # Drop a trailing batch smaller than batch_size (guard the empty
        # corpus case, which used to raise IndexError).
        if batch_xs and len(batch_xs[-1]) != batch_size:
            batch_xs = batch_xs[:-1]
            batch_ys = batch_ys[:-1]
        self.cur_xs = batch_xs
        self.cur_ys = batch_ys
        self.batch_num = len(batch_xs)
        self.idx = 0

    def get_batch(self, fill_maxlen=True):
        """Return the next ((xs, len_x), (ys, len_y)) batch of float32 arrays.

        fill_maxlen: pad/truncate to maxlen1/maxlen2 when True, otherwise to
            the longest sequence in the current batch.

        Raises IndexError when the epoch is exhausted (the original used a
        bare `assert False`, which disappears under `python -O`).
        """
        if self.idx >= self.batch_num:
            raise IndexError(
                "epoch exhausted; call make_epoch_data() to start a new one")
        cur_batch_x = self.cur_xs[self.idx]
        cur_batch_y = self.cur_ys[self.idx]
        self.idx += 1

        if fill_maxlen:
            cur_largest_len_x = self.maxlen1
            cur_largest_len_y = self.maxlen2
        else:
            cur_largest_len_x = max(len(x) for x in cur_batch_x)
            cur_largest_len_y = max(len(y) for y in cur_batch_y)

        cur_batch_x = np.array([self.align(x, cur_largest_len_x)
                                for x in cur_batch_x]).astype(np.float32)
        cur_batch_y = np.array([self.align(y, cur_largest_len_y)
                                for y in cur_batch_y]).astype(np.float32)
        return (cur_batch_x, cur_largest_len_x), (cur_batch_y, cur_largest_len_y)

    def align(self, arr, length):
        """Zero-pad or truncate `arr` to exactly `length` elements."""
        ori_len = len(arr)
        if length > ori_len:
            return arr + [0] * (length - ori_len)
        return arr[:length]

    def get_pad(self):
        # NOTE(review): the padding-token literal is also an empty string;
        # presumably it was "<pad>" -- confirm against the vocab file.
        return self.token2idx[""]
diff --git a/examples/nlp/hetu_transformer.py b/examples/nlp/hetu_transformer.py
new file mode 100644
index 0000000..907f7a8
--- /dev/null
+++ b/examples/nlp/hetu_transformer.py
@@ -0,0 +1,266 @@
+import hetu as ht
+from hetu import init
+import numpy as np
+
+
def layer_norm(
    input_tensor,
    feature_size,
    eps=1e-8
):
    """Apply layer normalization over the trailing feature axis.

    input_tensor: hetu tensor whose last dimension is `feature_size`.
    feature_size: size of the normalized axis.
    eps: small constant for numerical stability.

    Returns the layer-normalized hetu op.
    """
    scale = init.ones(name='layer_norm_scale', shape=(feature_size, ))
    # Fixed typo in the parameter name: 'layer_norm_biad' -> 'layer_norm_bias'.
    # NOTE(review): this changes the registered variable name; confirm no
    # checkpoint depends on the old misspelled name.
    bias = init.zeros(name='layer_norm_bias', shape=(feature_size, ))
    return ht.layer_normalization_op(input_tensor, scale, bias, eps=eps)
+
+
def dense(
    input_tensor,
    fan_in,
    fan_out,
    activation=None,
    kernel_initializer=init.xavier_normal,
    bias_initializer=init.zeros
):
    """Fully-connected layer: activation(input @ W + b).

    input_tensor: (..., fan_in) hetu tensor.
    fan_in / fan_out: input and output feature sizes.
    activation: optional hetu activation op applied to the result.
    kernel_initializer / bias_initializer: initializers for W and b.
    """
    w = kernel_initializer(name='dense_weights', shape=(fan_in, fan_out))
    b = bias_initializer(name='dense_bias', shape=(fan_out,))
    result = ht.matmul_op(input_tensor, w)
    result = result + ht.broadcastto_op(b, result)
    return result if activation is None else activation(result)
+
+
def dropout(
    input_tensor,
    dropout_prob
):
    """Apply dropout; a None or zero probability is a no-op.

    Note the hetu op takes the *keep* probability, hence 1 - dropout_prob.
    """
    if not dropout_prob:
        return input_tensor
    return ht.dropout_op(input_tensor, 1.0 - dropout_prob)
+
+
def get_token_embeddings(vocab_size, num_units, initializer=init.xavier_normal, zero_pad=True):
    """Create a (vocab_size, num_units) token-embedding table.

    With zero_pad=True, row 0 is a constant non-trainable zero vector so
    the padding id 0 always embeds to zeros.
    """
    if not zero_pad:
        return initializer(name='embedding_table', shape=(vocab_size, num_units))
    trainable_rows = initializer(
        name='embedding_table', shape=(vocab_size-1, num_units))
    zero_row = init.zeros(
        name='padding_zero', shape=(1, num_units), trainable=False)
    return ht.concat_op(zero_row, trainable_rows)
+
+
def multihead_attention(
    queries, keys, values,
    config,
    query_act=None, key_act=None, value_act=None,
    attention_mask=None,
    causality=False):
    """Multi-head scaled dot-product attention with residual + layer norm.

    queries/keys/values: hetu tensors reshaped below as (N, T, d_model) --
        assumed layout; TODO confirm at call sites.
    config: hyper-parameter namespace; reads batch_size, d_model, num_heads,
        maxlen2 and dropout_rate.
    query_act/key_act/value_act: optional activations for the projections.
    attention_mask: tensor whose non-zero entries mark real positions;
        callers pass the raw token ids, so padding id 0 is masked out.
    causality: if True, apply a lower-triangular future mask of side
        config.maxlen2 - 1 (decoder self-attention).

    Returns a (N, T_q, d_model) tensor.
    """

    def transpose_for_scores(input_tensor):
        # (N, T, d_model) -> (N, num_heads, T, d_model / num_heads)
        output_tensor = ht.array_reshape_op(
            input_tensor, [config.batch_size, -1, config.num_heads, config.d_model // config.num_heads])

        output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    batch_size = config.batch_size
    hidden_size = config.d_model
    num_attention_heads = config.num_heads
    # Causal mask side: decoder inputs are the target shifted by one token,
    # hence maxlen2 - 1 (see Transformer.decode).
    caus_len = config.maxlen2 - 1
    attention_probs_dropout_prob = config.dropout_rate

    size_per_head = hidden_size // num_attention_heads

    # reshape to 2d for the dense projections
    queries2d = ht.array_reshape_op(
        queries, [-1, hidden_size])  # (N * T_q, d_model)
    keys2d = ht.array_reshape_op(keys, [-1, hidden_size])  # (N * T_k, d_model)
    values2d = ht.array_reshape_op(
        values, [-1, hidden_size])  # (N * T_k, d_model)

    # linear transformation of Q, K, V
    query_layer = dense(queries2d, hidden_size, hidden_size,
                        query_act)  # (N * T_q, d_model)
    key_layer = dense(keys2d, hidden_size, hidden_size,
                      key_act)  # (N * T_k, d_model)
    value_layer = dense(values2d, hidden_size, hidden_size,
                        value_act)  # (N * T_k, d_model)

    # split into heads
    query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
    key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
    value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

    # scaled dot-product scores
    attention_scores = ht.batch_matmul_op(
        query_layer, key_layer, trans_B=True)  # (N, h, T_q, T_k)
    attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head)))

    # key-side padding mask: positions where attention_mask == 0 receive a
    # large negative bias so softmax assigns them ~zero probability
    if attention_mask is not None:
        zeros = ht.Variable('no_mask', value=np.array(
            (0,), dtype=np.float32), trainable=False)
        adder = ht.Variable('attention_mask', value=np.array(
            (-2**32+1,), dtype=np.float32), trainable=False)
        zeros = ht.broadcastto_op(zeros, attention_mask)
        adder = ht.broadcastto_op(adder, attention_mask)
        attention_mask = ht.where_op(attention_mask, zeros, adder)  # (N, T)
        attention_mask = ht.array_reshape_op(
            attention_mask, [batch_size, 1, 1, -1])
        attention_scores = attention_scores + \
            ht.broadcastto_op(attention_mask, attention_scores)
    if causality:
        # lower-triangular mask: position t may only attend to positions <= t
        tril = ht.Variable(name='tril', value=np.tril(
            np.ones((caus_len, caus_len))), trainable=False)  # (T, T)
        future_masks = ht.broadcast_shape_op(
            tril, [batch_size, num_attention_heads, caus_len, caus_len])
        adder = ht.Variable('future_mask', value=np.array(
            (-2**32+1,), dtype=np.float32), trainable=False)
        adder = ht.broadcastto_op(adder, future_masks)
        attention_scores = ht.where_op(
            future_masks, attention_scores, adder)  # (N, h, T, T)

    # attention probabilities -> weighted sum over the value vectors
    attention_probs = ht.softmax_op(attention_scores)
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
    context_layer = ht.batch_matmul_op(attention_probs, value_layer)
    context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3])
    outputs = ht.array_reshape_op(
        context_layer,
        [batch_size, -1, num_attention_heads * size_per_head])

    # Residual connection
    outputs = outputs + queries  # (N, T_q, d_model)

    # Normalize
    outputs = layer_norm(outputs, hidden_size)  # (N, T_q, d_model)
    return outputs
+
+
def ff(inputs, config):
    """Position-wise feed-forward block with residual connection + layer norm.

    inputs: (N, T, d_model) hetu tensor.
    config: reads d_model, d_ff and batch_size.
    """
    hidden = ht.array_reshape_op(inputs, [-1, config.d_model])
    hidden = dense(hidden, config.d_model, config.d_ff, activation=ht.relu_op)
    hidden = dense(hidden, config.d_ff, config.d_model)
    hidden = ht.array_reshape_op(
        hidden, [config.batch_size, -1, config.d_model])
    return layer_norm(hidden + inputs, config.d_model)
+
+
def label_smoothing(inputs, V, epsilon=0.1):
    """Smooth (one-hot) label distributions.

    inputs: tensor/array of label probabilities.
    V: number of classes; each class receives epsilon / V extra mass.
    epsilon: smoothing rate.
    """
    scaled = (1 - epsilon) * inputs
    return scaled + epsilon / V
+
+
def positional_encoding(
    inputs,
    inputs_shape,
    maxlen,
    masking=True
):
    """Sinusoidal positional encoding, returned as a constant hetu Variable.

    inputs: embedding tensor; used only to zero out positions where it is 0
        when masking is True.
    inputs_shape: static (N, T, E) shape of `inputs`.
    maxlen: maximum sequence length; must satisfy maxlen >= T.
    masking: if True, encodings are replaced by 0 wherever `inputs` is 0.

    Returns an (N, T, E) tensor of encodings.
    """
    N, T, E = tuple(inputs_shape)
    # i & -2 clears the lowest bit, i.e. i - (i % 2) for i >= 0, so each
    # (sin, cos) pair at dims (2k, 2k+1) shares one frequency.
    position_enc = np.array([
        [pos / np.power(10000, (i & -2)/E) for i in range(E)]
        for pos in range(maxlen)])
    position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
    position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

    # keep the first T positions and repeat across the batch dimension
    position_enc = position_enc[:T, :]
    outputs = ht.Variable(name='position_enc', value=np.tile(
        position_enc, [N, 1, 1]), trainable=False)
    zeros = ht.Variable(name='zeros', value=np.zeros(
        inputs_shape), trainable=False)

    if masking:
        outputs = ht.where_op(inputs, outputs, zeros)

    return outputs
+
+
class Transformer(object):
    """Encoder-decoder transformer assembled from hetu ops.

    hp: hyper-parameter namespace (see hparams.py); reads vocab_size,
    d_model, num_blocks, batch_size, maxlen1, maxlen2 and dropout_rate
    here (d_ff and num_heads are read inside ff / multihead_attention).
    """

    def __init__(self, hp):
        self.hp = hp
        # Embedding table shared by encoder input, decoder input and the
        # output projection in decode().
        self.embeddings = get_token_embeddings(
            self.hp.vocab_size, self.hp.d_model, zero_pad=True)

    def encode(self, xs):
        """Encode source token ids xs (N, T1) into memory (N, T1, d_model)."""
        x = xs

        # embedding lookup, scaled by sqrt(d_model)
        enc = ht.embedding_lookup_op(self.embeddings, x)  # (N, T1, d_model)
        enc = enc * self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, (self.hp.batch_size,
                                         self.hp.maxlen1, self.hp.d_model), self.hp.maxlen1)
        enc = dropout(enc, self.hp.dropout_rate)

        # Blocks
        for i in range(self.hp.num_blocks):
            # self-attention; the raw token ids double as the padding mask
            # (id 0 = pad) inside multihead_attention
            enc = multihead_attention(
                queries=enc, keys=enc, values=enc,
                config=self.hp,
                attention_mask=x,
                causality=False
            )
            # feed forward
            enc = ff(enc, config=self.hp)
        memory = enc
        return memory

    def decode(self, ys, memory, src_masks):
        """Decode target-side inputs against encoder memory.

        ys: decoder input token ids (presumably the target shifted right --
            see train(), which passes ys[0]).
        memory: encoder outputs (N, T1, d_model).
        src_masks: source token ids used as the encoder padding mask.

        Returns logits of shape (N, T2, vocab_size).
        """
        decoder_inputs = ys

        # embedding lookup, scaled by sqrt(d_model)
        dec = ht.embedding_lookup_op(
            self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec = dec * self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, (self.hp.batch_size,
                                         self.hp.maxlen2-1, self.hp.d_model), self.hp.maxlen2)
        dec = dropout(dec, self.hp.dropout_rate)

        # Blocks
        for i in range(self.hp.num_blocks):
            # Masked self-attention (Note that causality is True at this time)
            dec = multihead_attention(
                queries=dec, keys=dec, values=dec,
                config=self.hp,
                attention_mask=decoder_inputs,
                causality=True,
            )
            # Vanilla attention over the encoder memory
            dec = multihead_attention(
                queries=dec, keys=memory, values=memory,
                config=self.hp,
                attention_mask=src_masks,
                causality=False,
            )
            # Feed Forward
            dec = ff(dec, config=self.hp)

        # Output projection reuses the embedding table (weight tying).
        dec = ht.array_reshape_op(
            dec, [-1, self.hp.d_model])  # (N * T, d_model)
        logits = ht.array_reshape_op(ht.matmul_op(dec, self.embeddings, trans_B=True), [
            self.hp.batch_size, -1, self.hp.vocab_size])  # (N, T, vocab)

        return logits

    def train(self, xs, ys):
        """Build the training loss.

        xs: source token ids. ys: pair of (decoder_inputs, targets) --
        inferred from the ys[0]/ys[1] usage below.
        """
        # forward
        memory = self.encode(xs)
        logits = self.decode(ys[0], memory, xs)

        # train scheme: label-smoothed softmax cross-entropy
        y = ys[1]
        y_ = label_smoothing(ht.one_hot_op(
            y, self.hp.vocab_size), self.hp.vocab_size)  # (N, T, vocab)
        loss = ht.softmaxcrossentropy_op(logits, y_)

        return loss
diff --git a/examples/nlp/hparams.py b/examples/nlp/hparams.py
new file mode 100644
index 0000000..fbce591
--- /dev/null
+++ b/examples/nlp/hparams.py
@@ -0,0 +1,63 @@
+import argparse
+
+
class Hparams:
    """Command-line hyper-parameters for the transformer example.

    Obtain the parsed namespace via `Hparams.parser.parse_args()`.
    """

    parser = argparse.ArgumentParser()
    _arg = parser.add_argument

    # preprocessing
    _arg('--vocab_size', default=32000, type=int)

    # training / evaluation data files
    _arg('--train1', default='iwslt2016/segmented/train.de.bpe',
         help="german training segmented data")
    _arg('--train2', default='iwslt2016/segmented/train.en.bpe',
         help="english training segmented data")
    _arg('--eval1', default='iwslt2016/segmented/eval.de.bpe',
         help="german evaluation segmented data")
    _arg('--eval2', default='iwslt2016/segmented/eval.en.bpe',
         help="english evaluation segmented data")
    _arg('--eval3', default='iwslt2016/prepro/eval.en',
         help="english evaluation unsegmented data")

    # vocabulary
    _arg('--vocab', default='iwslt2016/segmented/bpe.vocab',
         help="vocabulary file path")

    # training scheme
    _arg('--batch_size', default=16, type=int)
    _arg('--eval_batch_size', default=128, type=int)
    _arg('--lr', default=0.0003, type=float, help="learning rate")
    _arg('--warmup_steps', default=4000, type=int)
    _arg('--logdir', default="logs/tf", help="log directory")
    _arg('--num_epochs', default=20, type=int)
    _arg('--evaldir', default="logs/tf/eval", help="evaluation dir")

    # model architecture
    _arg('--d_model', default=512, type=int,
         help="hidden dimension of encoder/decoder")
    _arg('--d_ff', default=2048, type=int,
         help="hidden dimension of feedforward layer")
    _arg('--num_blocks', default=6, type=int,
         help="number of encoder/decoder blocks")
    _arg('--num_heads', default=8, type=int,
         help="number of attention heads")
    _arg('--maxlen1', default=100, type=int,
         help="maximum length of a source sequence")
    _arg('--maxlen2', default=100, type=int,
         help="maximum length of a target sequence")
    _arg('--dropout_rate', default=0.3, type=float)
    _arg('--smoothing', default=0.1, type=float,
         help="label smoothing rate")

    # testing
    _arg('--test1', default='iwslt2016/segmented/test.de.bpe',
         help="german test segmented data")
    _arg('--test2', default='iwslt2016/prepro/test.en',
         help="english test data")
    _arg('--ckpt', help="checkpoint file path")
    _arg('--test_batch_size', default=128, type=int)
    _arg('--testdir', default="test/1", help="test result dir")

    del _arg
diff --git a/examples/nlp/prepare_data.py b/examples/nlp/prepare_data.py
new file mode 100644
index 0000000..85b9ff6
--- /dev/null
+++ b/examples/nlp/prepare_data.py
@@ -0,0 +1,128 @@
+import os
+import wget
+import tarfile
+import errno
+import sentencepiece as spm
+import re
+from hparams import Hparams
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
+def prepro(hp):
+ """Load raw data -> Preprocessing -> Segmenting with sentencepice
+ hp: hyperparams. argparse.
+ """
+ logging.info("# Check if raw files exist")
+ train1 = "iwslt2016/de-en/train.tags.de-en.de"
+ train2 = "iwslt2016/de-en/train.tags.de-en.en"
+ eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml"
+ eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml"
+ test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml"
+ test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml"
+ for f in (train1, train2, eval1, eval2, test1, test2):
+ if not os.path.isfile(f):
+ raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f)
+
+ logging.info("# Preprocessing")
+ # train
+
+ def _prepro(x): return [line.strip() for line in open(x, 'r').read().split("\n")
+ if not line.startswith("<")]
+ prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2)
+ assert len(prepro_train1) == len(
+ prepro_train2), "Check if train source and target files match."
+
+ # eval
+ def _prepro(x): return [re.sub("<[^>]+>", "", line).strip()
+ for line in open(x, 'r').read().split("\n")
+ if line.startswith("= num_to_predict:
+ break
+ masked_token = None
+ # replace with [MASK] at 80%.
+ if rng.random() < 0.8:
+ masked_token = "[MASK]"
+ else:
+ # keep original at 10%.
+ if rng.random() < 0.5:
+ masked_token = tokens[index]
+ # replace with random word at 10%.
+ else:
+ masked_token = vocab_words[rng.randint(
+ 0, len(vocab_words) - 1)]
+ output_tokens[index] = masked_token
+ masked_lms.append([index, tokens[index]])
+
+ masked_lms.sort(key=lambda x: x[0])
+ masked_lm_positions = []
+ masked_lm_labels = []
+
+ for p in masked_lms:
+ masked_lm_positions.append(p[0])
+ masked_lm_labels.append(p[1])
+
+ return (output_tokens, masked_lm_positions, masked_lm_labels)
+
+
def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Create TrainingInstances (next-sentence pairs + masked LM) for one document.

    all_document: list of documents; each document is the token list that
        tokenizer.tokenize produced (see create_pretrain_data).
    doc_id: index of the document to process.
    max_seq_length: maximum total length including [CLS] and two [SEP]s.
    short_seq_prob: probability of sampling a shorter target length.
    masked_lm_prob / max_predictions_per_seq: forwarded to
        create_masked_lm_predictions.
    vocab_words: vocabulary token list for random replacement.
    rng: random.Random; all sampling goes through it.

    NOTE(review): in the reference BERT script a document is a list of
    *sentences*; here documents are flat token lists, so `segment` below is
    a single token and `current_length += len(segment)` counts characters
    rather than tokens -- confirm this is intended.
    """
    document = all_document[doc_id]
    max_num_tokens = max_seq_length - 3  # [CLS], [SEP], [SEP]
    target_seq_length = max_num_tokens
    # generate a short sequence with probability short_seq_prob,
    # in order to minimize the mismatch between pre-training and fine-tuning.
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # create sentence A from a random-length prefix of the chunk
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)
                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend([current_chunk[j]])
                tokens_b = []
                # Random next: with prob 0.5 (or when the chunk has a single
                # segment) take sentence B from a different random document
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)
                    # up to 10 attempts to draw a document other than this one
                    for _ in range(10):
                        random_document_index = rng.randint(
                            0, len(all_document) - 1)
                        if random_document_index != doc_id:
                            break
                    # If picked random document is the same as the current document
                    if random_document_index == doc_id:
                        is_random_next = False
                    random_document = all_document[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend([random_document[j]])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next: B is the remainder of the chunk
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend([current_chunk[j]])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                # Assemble [CLS] A [SEP] B [SEP] with segment ids 0 / 1.
                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions(
                    tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)

                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances
+
+
def convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq):
    """Convert TrainingInstances into fixed-size int32 numpy feature arrays.

    instances: objects with tokens, segment_ids, is_random_next,
        masked_lm_positions and masked_lm_labels attributes.
    tokenizer: provides convert_tokens_to_ids(tokens) -> list[int].
    max_seq_length: padded token-sequence length.
    max_predictions_per_seq: padded masked-LM slot count.

    Returns (input_ids, input_mask, segment_ids, masked_lm_positions,
    next_sentence_labels), each a numpy array with num_instances rows.
    """
    num_instances = len(instances)
    input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32")
    input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32")
    segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32")
    masked_lm_positions_list = np.zeros(
        [num_instances, max_predictions_per_seq], dtype="int32")
    masked_lm_ids_list = np.zeros(
        [num_instances, max_predictions_per_seq], dtype="int32")
    next_sentence_labels_list = np.zeros(num_instances, dtype="int32")
    for (idx, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        # Pad the token-level features out to max_seq_length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(
            instance.masked_lm_labels)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)

        next_sentence_label = 1 if instance.is_random_next else 0

        input_ids_list[idx][:] = input_ids
        input_mask_list[idx][:] = input_mask
        segment_ids_list[idx][:] = segment_ids
        # BUG FIX: the positions row used to be filled with masked_lm_ids,
        # so consumers received label ids where they expected token
        # positions. Store positions and ids in their own arrays.
        masked_lm_positions_list[idx][:] = masked_lm_positions
        masked_lm_ids_list[idx][:] = masked_lm_ids
        next_sentence_labels_list[idx] = next_sentence_label

    # NOTE(review): masked_lm_ids_list is filled but not returned, matching
    # the original 5-tuple interface -- the masked-LM label ids are still
    # unavailable to callers; confirm whether they should be returned too.
    return input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list
+
+
def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng):
    """Tokenize every training document and build BERT pretraining arrays.

    dataset: datasets dict with a 'train' split of {'text': ...} rows.
    tokenizer: provides tokenize() and convert_tokens_to_ids() plus vocab.
    Remaining parameters are forwarded to create_data_from_document /
    convert_instance_to_data.
    """
    documents = []
    train_split = dataset['train']
    for doc_idx in range(train_split.shape[0]):
        doc_tokens = tokenizer.tokenize(train_split[doc_idx]['text'])
        documents.append(doc_tokens)
        print(len(doc_tokens))

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for doc_id in range(len(documents)):
        instances.extend(
            create_data_from_document(
                documents, doc_id, max_seq_length, short_seq_prob,
                masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    # Each instance carries: tokens, segment_ids, is_random_next,
    # masked_lm_positions, masked_lm_labels.
    return convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq)
+
+
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
    """Trim the pair in place until len(a) + len(b) <= max_num_tokens.

    Always shortens the currently longer sequence, removing from the front
    or the back at random to avoid positional bias.
    """
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(longer) >= 1
        if rng.random() < 0.5:
            del longer[0]   # drop from the front
        else:
            longer.pop()    # drop from the back
+
+
def show_dataset_detail(dataset):
    """Print a quick summary of the dataset for debugging."""
    for detail in (
        dataset.shape,
        dataset.column_names,
        dataset['train'].features,
        dataset['train'][0]['text'],
    ):
        print(detail)
+
+
if __name__ == "__main__":
    # Pretraining-data generation settings (BERT-style defaults).
    max_seq_length = 512
    do_lower_case = True
    short_seq_prob = 0.1
    masked_lm_prob = 0.15
    max_predictions_per_seq = 20

    # NOTE(review): hard-coded absolute paths; turn these into CLI arguments
    # before sharing this script.
    vocab_path = "/home/xiaonan/develope/Athena/datasets/bert-base-uncased-vocab.txt"
    dataset = load_dataset(
        '/home/xiaonan/develope/Athena/examples/nlp/bookcorpus', cache_dir=".")
    print("total number of documents {} ".format(dataset['train'].shape[0]))
    # Fixed seed so masking and sentence pairing are reproducible.
    random_seed = 123
    rng = random.Random(random_seed)
    tokenizer = hetu.BertTokenizer(
        vocab_file=vocab_path, do_lower_case=do_lower_case)

    input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list = create_pretrain_data(
        dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng)
    # Spot-check the last generated example.
    print(input_ids_list[-1])
    print(input_mask_list[-1])
    print(segment_ids_list[-1])
    print(masked_lm_positions_list[-1])
    print(next_sentence_labels_list[-1])
diff --git a/examples/nlp/tf_transformer.py b/examples/nlp/tf_transformer.py
new file mode 100644
index 0000000..2c6b1d7
--- /dev/null
+++ b/examples/nlp/tf_transformer.py
@@ -0,0 +1,442 @@
+import numpy as np
+import tensorflow as tf
+
+from tqdm import tqdm
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
def ln(inputs, epsilon=1e-8, scope="ln"):
    '''Layer normalization (https://arxiv.org/abs/1607.06450).

    inputs: tensor with 2+ dims, batch first; normalized over the last axis.
    epsilon: numerical-stability constant added to the variance.
    scope: variable scope name.

    Returns a tensor with the same shape and dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        feature_shape = inputs.get_shape()[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", feature_shape,
                               initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", feature_shape,
                                initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** .5)
        return gamma * normalized + beta
+
+
def get_token_embeddings(vocab_size, num_units, initializer=tf.contrib.layers.xavier_initializer(), zero_pad=True):
    '''Constructs token embedding matrix.
    Note that the column of index 0's are set to zeros.
    vocab_size: scalar. V.
    num_units: embedding dimensionalty. E.
    zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
        so query/key padding masks can be applied easily.

    Returns
      weight variable: (V, E)
    '''
    with tf.variable_scope("shared_weight_matrix"):
        embeddings = tf.get_variable('weight_mat',
                                     dtype=tf.float32,
                                     shape=(vocab_size, num_units),
                                     initializer=initializer)
        # Replace row 0 with a constant zero vector; its initialized values
        # are discarded.
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings
+
+
def multihead_attention(
        queries, keys, values,
        batch_size, hidden_size,
        num_attention_heads=8,
        query_act=None, key_act=None, value_act=None,
        attention_mask=None,
        attention_probs_dropout_prob=0.0,
        training=True, causality=False,
        scope="multihead_attention"):
    '''Multi-head scaled dot-product attention with residual + layer norm.

    queries: (N, T_q, d_model); keys/values: (N, T_k, d_model).
    attention_mask: (N, T_k) tensor whose non-zero entries mark *padding*
        keys (callers pass tf.math.equal(ids, 0)); those keys receive a
        large negative score bias before softmax.
    attention_probs_dropout_prob: dropout on the attention probabilities.
    training: enables dropout when True.
    causality: if True, also mask out future positions (decoder
        self-attention).

    Returns a (N, T_q, d_model) tensor.
    '''

    def transpose_for_scores(input_tensor):
        # (N, T, d_model) -> (N, heads, T, d_model / heads)
        output_tensor = tf.reshape(
            input_tensor, [batch_size, -1, num_attention_heads, hidden_size // num_attention_heads])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    size_per_head = hidden_size // num_attention_heads
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # linear transformation of Q, K, V
        query_layer = tf.layers.dense(
            queries, hidden_size, activation=query_act)  # (N, T_q, d_model)
        key_layer = tf.layers.dense(
            keys, hidden_size, activation=key_act)  # (N, T_k, d_model)
        value_layer = tf.layers.dense(
            values, hidden_size, activation=value_act)  # (N, T_k, d_model)

        # split into heads
        query_layer = transpose_for_scores(
            query_layer)  # (N, h, T_q, d_model/h)
        key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
        value_layer = transpose_for_scores(
            value_layer)  # (N, h, T_k, d_model/h)

        # scaled dot-product scores
        attention_scores = tf.matmul(
            query_layer, key_layer, transpose_b=True)  # (N, h, T_q, T_k)
        attention_scores /= size_per_head ** 0.5

        # key-side padding mask: padding keys get a ~ -inf bias before softmax
        if attention_mask is not None:
            attention_mask = tf.to_float(attention_mask)  # (N, T_k)
            attention_mask = tf.reshape(
                attention_mask, [batch_size, 1, 1, -1])  # (N, 1, 1, T_k)
            attention_scores = attention_scores + \
                attention_mask * (-2**32+1)  # (N, h, T_q, T_k)
        if causality:
            # lower-triangular future mask derived from the score shape
            diag_vals = tf.ones_like(
                attention_scores[0, 0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()  # (T_q, T_k)
            future_masks = tf.broadcast_to(
                tril, [batch_size, num_attention_heads, tril.shape[0], tril.shape[1]])  # (N, h, T_q, T_k)
            paddings = tf.ones_like(future_masks) * (-2**32+1)
            attention_scores = tf.where(
                tf.equal(future_masks, 0), paddings, attention_scores)

        # probabilities -> weighted sum of value vectors
        attention_probs = tf.nn.softmax(attention_scores)  # (N, h, T_q, T_k)
        attention_probs = tf.layers.dropout(
            attention_probs, rate=attention_probs_dropout_prob, training=training)
        # (N, h, T_q, d_model/h)
        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(
            context_layer, [0, 2, 1, 3])  # (N, T_q, h, d_model/h)
        outputs = tf.reshape(context_layer, [
            batch_size, -1, num_attention_heads * size_per_head])  # (N, T_q, d_model)

        # Residual connection
        outputs += queries  # (N, T_q, d_model)

        # Normalize
        outputs = ln(outputs)  # (N, T_q, d_model)

    return outputs
+
+
def ff(inputs, num_units, scope="positionwise_feedforward"):
    '''Position-wise feed-forward network (section 3.3 of the paper).

    inputs: 3d tensor [N, T, C].
    num_units: [inner_dim, outer_dim]; outer_dim must equal C so the
        residual connection is shape-compatible.
    scope: variable scope name.

    Returns a 3d tensor with the same shape and dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        hidden = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        projected = tf.layers.dense(hidden, num_units[1])
        # residual connection followed by layer normalization
        return ln(projected + inputs)
+
+
def label_smoothing(inputs, epsilon=0.1):
    '''Label smoothing (section 5.4; https://arxiv.org/abs/1512.00567).

    inputs: 3d tensor [N, T, V] of (one-hot) label distributions, where V
        is the vocabulary size.
    epsilon: smoothing rate; each of the V classes receives epsilon / V of
        the probability mass, so a one-hot 1 becomes 1 - epsilon + epsilon/V
        (e.g. 0.9333... for V=3, epsilon=0.1).
    '''
    num_classes = inputs.get_shape().as_list()[-1]
    return (1 - epsilon) * inputs + epsilon / num_classes
+
+
def positional_encoding(inputs,
                        maxlen,
                        masking=True,
                        scope="positional_encoding"):
    '''Sinusoidal Positional Encoding. See 3.5 of the paper.
    inputs: 3d tensor. (N, T, E) -- the embeddings to add encodings to.
    maxlen: scalar. Must be >= T.
    masking: Boolean. If True, positions where `inputs` is 0 keep the value
        0 instead of receiving an encoding.
    scope: Optional scope for `variable_scope`.

    returns
      3d float tensor with the same shape as inputs.
    '''

    E = inputs.get_shape().as_list()[-1]  # static
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]  # dynamic
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # position indices, one row of [0..T) per batch element
        position_ind = tf.tile(tf.expand_dims(
            tf.range(T), 0), [N, 1])  # (N, T)

        # First part of the PE function: sin and cos argument.
        # i - i%2 pairs dims (2k, 2k+1) onto the same frequency.
        position_enc = np.array([
            [pos / np.power(10000, (i-i % 2)/E) for i in range(E)]
            for pos in range(maxlen)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
        position_enc = tf.convert_to_tensor(
            position_enc, tf.float32)  # (maxlen, E)

        # lookup the first T rows for every batch element
        outputs = tf.nn.embedding_lookup(position_enc, position_ind)

        # masks: keep zeros where the input embedding is exactly 0
        if masking:
            outputs = tf.where(tf.equal(inputs, 0), inputs, outputs)

        return tf.to_float(outputs)
+
+# def noam_scheme(init_lr, global_step, warmup_steps=4000.):
+# '''Noam scheme learning rate decay
+# init_lr: initial learning rate. scalar.
+# global_step: scalar.
+# warmup_steps: scalar. During warmup_steps, learning rate increases
+# until it reaches init_lr.
+# '''
+# step = tf.cast(global_step + 1, dtype=tf.float32)
+# return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
+
+
+class Transformer(object):
+    '''Transformer encoder-decoder (TF 1.x graph mode).
+
+    xs: tuple of
+    x: int32 tensor. (N, T1)
+    x_seqlens: int32 tensor. (N,)
+    sents1: str tensor. (N,)
+    ys: tuple of
+    decoder_input: int32 tensor. (N, T2)
+    y: int32 tensor. (N, T2)
+    y_seqlen: int32 tensor. (N, )
+    sents2: str tensor. (N,)
+    training: boolean.
+    '''
+
+    def __init__(self, hp):
+        # hp: hyper-parameter namespace (vocab_size, d_model, num_blocks, ...).
+        self.hp = hp
+        # self.token2idx, self.idx2token = load_vocab(hp.vocab)
+        # Token embedding table, shared by encoder input, decoder input and
+        # the final output projection (see decode()).
+        self.embeddings = get_token_embeddings(
+            self.hp.vocab_size, self.hp.d_model, zero_pad=True)
+
+    def encode(self, xs, training=True):
+        '''
+        xs: int32 tensor of source token ids. (N, T1)
+        training: boolean, toggles dropout.
+
+        Returns
+        memory: encoder outputs. (N, T1, d_model)
+        src_masks: boolean padding mask (True at pad positions). (N, T1)
+        '''
+        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
+            x = xs
+
+            # src_masks: True where the source token is padding (id 0)
+            src_masks = tf.math.equal(x, 0)  # (N, T1)
+
+            # embedding
+            enc = tf.nn.embedding_lookup(
+                self.embeddings, x)  # (N, T1, d_model)
+            enc *= self.hp.d_model**0.5  # scale
+
+            enc += positional_encoding(enc, self.hp.maxlen1)
+            enc = tf.layers.dropout(
+                enc, self.hp.dropout_rate, training=training)
+
+            # Blocks: num_blocks x (self-attention -> feed-forward)
+            for i in range(self.hp.num_blocks):
+                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
+                    # self-attention
+                    enc = multihead_attention(
+                        queries=enc, keys=enc, values=enc,
+                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
+                        num_attention_heads=self.hp.num_heads,
+                        attention_mask=src_masks,
+                        attention_probs_dropout_prob=self.hp.dropout_rate,
+                        training=training,
+                        causality=False
+                    )
+                    # feed forward
+                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
+        memory = enc
+        return memory, src_masks
+
+    def decode(self, ys, memory, src_masks, training=True):
+        '''
+        ys: int32 tensor of decoder input token ids. (N, T2)
+        memory: encoder outputs. (N, T1, d_model)
+        src_masks: (N, T1)
+        training: boolean, toggles dropout.
+
+        Returns
+        logits: (N, T2, V). float32.
+        '''
+        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
+            decoder_inputs = ys
+
+            # tgt_masks: True where the target token is padding (id 0)
+            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)
+
+            # embedding
+            dec = tf.nn.embedding_lookup(
+                self.embeddings, decoder_inputs)  # (N, T2, d_model)
+            dec *= self.hp.d_model ** 0.5  # scale
+
+            dec += positional_encoding(dec, self.hp.maxlen2)
+            dec = tf.layers.dropout(
+                dec, self.hp.dropout_rate, training=training)
+
+            # Blocks: num_blocks x (masked self-attn -> enc-dec attn -> FF)
+            for i in range(self.hp.num_blocks):
+                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
+                    # Masked self-attention (Note that causality is True at this time)
+                    dec = multihead_attention(
+                        queries=dec, keys=dec, values=dec,
+                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
+                        num_attention_heads=self.hp.num_heads,
+                        attention_mask=tgt_masks,
+                        attention_probs_dropout_prob=self.hp.dropout_rate,
+                        training=training,
+                        causality=True,
+                        scope="self_attention"
+                    )
+                    # Vanilla attention over the encoder memory
+                    dec = multihead_attention(
+                        queries=dec, keys=memory, values=memory,
+                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
+                        num_attention_heads=self.hp.num_heads,
+                        attention_mask=src_masks,
+                        attention_probs_dropout_prob=self.hp.dropout_rate,
+                        training=training,
+                        causality=False,
+                        scope="vanilla_attention"
+                    )
+                    # Feed Forward
+                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
+
+        # Final linear projection (embedding weights are shared)
+        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
+        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
+        # y_hat = tf.to_int32(tf.argmax(logits, axis=-1))
+
+        return logits
+
+    def train(self, xs, ys):
+        '''Builds the training loss graph.
+
+        xs: int32 tensor of source ids. (N, T1)
+        ys: pair (decoder_inputs, targets) of int32 tensors, (N, T2) each.
+
+        Returns
+        loss: per-position smoothed cross-entropy (not yet reduced); the
+        caller masks padding and averages it.
+        '''
+        # forward
+        memory, src_masks = self.encode(xs)
+        logits = self.decode(ys[0], memory, src_masks)
+
+        # train scheme: one-hot targets softened by label smoothing
+        y = ys[1]
+        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
+        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+            logits=logits, labels=y_)
+
+        return loss
+
+ # def eval(self, xs, ys):
+ # '''Predicts autoregressively
+ # At inference, input ys is ignored.
+ # Returns
+ # y_hat: (N, T2)
+ # '''
+ # decoder_inputs, y, y_seqlen, sents2 = ys
+
+    # decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx["<s>"]  # start token (angle brackets lost in extraction)
+ # ys = (decoder_inputs, y, y_seqlen, sents2)
+
+ # memory, sents1, src_masks = self.encode(xs, False)
+
+ # logging.info("Inference graph is being built. Please be patient.")
+ # for _ in tqdm(range(self.hp.maxlen2)):
+ # logits, y_hat, y, sents2 = self.decode(ys, memory, src_masks, False)
+    # if tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"]: break  # pad token (angle brackets lost in extraction)
+
+ # _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
+ # ys = (_decoder_inputs, y, y_seqlen, sents2)
+
+ # # monitor a random sample
+ # n = tf.random_uniform((), 0, tf.shape(y_hat)[0]-1, tf.int32)
+ # sent1 = sents1[n]
+ # pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token)
+ # sent2 = sents2[n]
+
+ # tf.summary.text("sent1", sent1)
+ # tf.summary.text("pred", pred)
+ # tf.summary.text("sent2", sent2)
+ # summaries = tf.summary.merge_all()
+
+ # return y_hat, summaries
+
+
+# def convert_idx_to_token_tensor(inputs, idx2token):
+# '''Converts int32 tensor to string tensor.
+# inputs: 1d int32 tensor. indices.
+# idx2token: dictionary
+
+# Returns
+# 1d string tensor.
+# '''
+# def my_func(inputs):
+# return " ".join(idx2token[elem] for elem in inputs)
+
+# return tf.py_func(my_func, [inputs], tf.string)
+
+# def load_vocab(vocab_fpath):
+# '''Loads vocabulary file and returns idx<->token maps
+# vocab_fpath: string. vocabulary file path.
+# Note that these are reserved
+# 0: <pad>, 1: <unk>, 2: <s>, 3: </s>
+
+# Returns
+# two dictionaries.
+# '''
+# vocab = [line.split()[0] for line in open(vocab_fpath, 'r', encoding='utf-8').read().splitlines()]
+# token2idx = {token: idx for idx, token in enumerate(vocab)}
+# idx2token = {idx: token for idx, token in enumerate(vocab)}
+# return token2idx, idx2token
diff --git a/examples/nlp/train_hetu_transformer.py b/examples/nlp/train_hetu_transformer.py
new file mode 100644
index 0000000..174f2b1
--- /dev/null
+++ b/examples/nlp/train_hetu_transformer.py
@@ -0,0 +1,62 @@
+from tqdm import tqdm
+import os
+import math
+import logging
+from hparams import Hparams
+from hetu_transformer import Transformer
+from data_load import DataLoader
+import hetu as ht
+import numpy as np
+# import time
+
+logging.basicConfig(level=logging.INFO)
+
+
+# Parse command-line hyper-parameters.
+logging.info("# hparams")
+hparams = Hparams()
+parser = hparams.parser
+hp = parser.parse_args()
+print(hp)
+
+logging.info("# Prepare train/eval batches")
+dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)
+
+# Build the symbolic graph: placeholders, model, masked mean loss, optimizer.
+ctx = ht.gpu(1)
+xs = ht.Variable(name='xs')
+ys1 = ht.Variable(name='ys1')
+ys2 = ht.Variable(name='ys2')
+nonpadding = ht.Variable(name='nonpadding')
+
+logging.info("# Load model")
+m = Transformer(hp)
+loss = m.train(xs, (ys1, ys2))
+# Average the per-token loss over non-padding positions only.
+loss = ht.div_op(ht.reduce_sum_op(loss * nonpadding,
+                                  axes=[0, 1]), ht.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7)
+opt = ht.optim.SGDOptimizer(hp.lr)
+train_op = opt.minimize(loss)
+executor = ht.Executor([loss, train_op], ctx=ctx)
+
+logging.info("# Session")
+
+
+for ep in range(hp.num_epochs):
+    dataloader.make_epoch_data(hp.batch_size)
+    for i in tqdm(range(dataloader.batch_num)):
+        xs_val, ys_val = dataloader.get_batch()
+        # st = time.time()
+        # Teacher forcing: decoder input is y[:, :-1], target is y[:, 1:].
+        xs_val = xs_val[0]
+        ys1_val = ys_val[0][:, :-1]
+        ys2_val = ys_val[0][:, 1:]
+        nonpadding_val = np.not_equal(
+            ys2_val, dataloader.get_pad()).astype(np.float32)
+        _loss, _ = executor.run(
+            feed_dict={xs: xs_val, ys1: ys1_val, ys2: ys2_val, nonpadding: nonpadding_val})
+        # en = time.time()
+        # if i == 100:
+        #     exit()
+
+        log_str = 'Iteration %d, loss %f' % (i, _loss.asnumpy())
+        print(log_str)
+        # print('time: ', (en - st))
+
+logging.info("Done")
diff --git a/examples/nlp/train_tf_transformer.py b/examples/nlp/train_tf_transformer.py
new file mode 100644
index 0000000..8def5d0
--- /dev/null
+++ b/examples/nlp/train_tf_transformer.py
@@ -0,0 +1,98 @@
+import tensorflow as tf
+
+from tqdm import tqdm
+import os
+import math
+import logging
+from hparams import Hparams
+from tf_transformer import Transformer
+from data_load import DataLoader
+# import time
+
+logging.basicConfig(level=logging.INFO)
+
+
+# Parse command-line hyper-parameters.
+logging.info("# hparams")
+hparams = Hparams()
+parser = hparams.parser
+hp = parser.parse_args()
+print(hp)
+# save_hparams(hp, hp.logdir)
+
+logging.info("# Prepare train/eval batches")
+dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)
+
+# NOTE(review): placeholder shapes hardcode batch_size=16, maxlen1=100 and
+# maxlen2=100 — these must match the hparams passed on the command line.
+xs = tf.placeholder(name='xs', dtype=tf.int32, shape=[16, 100])
+ys1 = tf.placeholder(name='ys1', dtype=tf.int32, shape=[16, 99])
+ys2 = tf.placeholder(name='ys2', dtype=tf.int32, shape=[16, 99])
+
+logging.info("# Load model")
+m = Transformer(hp)
+loss = m.train(xs, (ys1, ys2))
+# Mean per-token loss over non-padding target positions.
+nonpadding = tf.to_float(tf.not_equal(ys2, dataloader.get_pad()))  # 0: <pad>
+loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
+
+global_step = tf.train.get_or_create_global_step()
+optimizer = tf.train.GradientDescentOptimizer(hp.lr)
+train_op = optimizer.minimize(loss, global_step=global_step)
+# y_hat, eval_summaries = m.eval(xs, ys)
+# y_hat = m.infer(xs, ys)
+
+logging.info("# Session")
+saver = tf.train.Saver(max_to_keep=hp.num_epochs)
+with tf.Session() as sess:
+    # Resume from the latest checkpoint if one exists, otherwise cold start.
+    ckpt = tf.train.latest_checkpoint(hp.logdir)
+    if ckpt is None:
+        logging.info("Initializing from scratch")
+        sess.run(tf.global_variables_initializer())
+        # save_variable_specs(os.path.join(hp.logdir, "specs"))
+    else:
+        saver.restore(sess, ckpt)
+
+    _gs = sess.run(global_step)
+
+    for ep in range(hp.num_epochs):
+        dataloader.make_epoch_data(hp.batch_size)
+        for i in tqdm(range(dataloader.batch_num)):
+            xs_val, ys_val = dataloader.get_batch()
+            # st = time.time()
+            # Teacher forcing: decoder input y[:, :-1], target y[:, 1:].
+            _loss, _, _gs = sess.run([loss, train_op, global_step], feed_dict={
+                xs: xs_val[0], ys1: ys_val[0][:, :-1], ys2: ys_val[0][:, 1:]})
+            # en = time.time()
+            # if i == 100:
+            #     exit()
+            # epoch = math.ceil(_gs / num_train_batches)
+
+            log_str = 'Iteration %d, loss %f' % (i, _loss)
+            print(log_str)
+            # print('time: ', (en - st))
+
+ # logging.info("epoch {} is done".format(ep))
+ # _loss = sess.run(loss) # train loss
+
+ # logging.info("# test evaluation")
+ # _, _eval_summaries = sess.run([eval_init_op, eval_summaries])
+ # summary_writer.add_summary(_eval_summaries, _gs)
+
+ # logging.info("# get hypotheses")
+ # hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, m.idx2token)
+
+ # logging.info("# write results")
+ # model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
+ # if not os.path.exists(hp.evaldir): os.makedirs(hp.evaldir)
+ # translation = os.path.join(hp.evaldir, model_output)
+ # with open(translation, 'w') as fout:
+ # fout.write("\n".join(hypotheses))
+
+ # logging.info("# calc bleu score and append it to translation")
+ # calc_bleu(hp.eval3, translation)
+
+ # logging.info("# save models")
+ # ckpt_name = os.path.join(hp.logdir, model_output)
+ # saver.save(sess, ckpt_name, global_step=_gs)
+ # logging.info("after training of {} epochs, {} has been saved.".format(epoch, ckpt_name))
+
+ # logging.info("# fall back to train mode")
+
+
+logging.info("Done")
diff --git a/examples/nlp/wikipedia.py b/examples/nlp/wikipedia.py
new file mode 100644
index 0000000..d86c9ac
--- /dev/null
+++ b/examples/nlp/wikipedia.py
@@ -0,0 +1,540 @@
+"""Wikipedia dataset containing cleaned articles of all languages."""
+import bz2
+import codecs
+import json
+import re
+import xml.etree.cElementTree as etree
+
+import datasets
+
+
+logger = datasets.logging.get_logger(__name__)
+
+
+_CITATION = """\
+@ONLINE {wikidump,
+ author = {Wikimedia Foundation},
+ title = {Wikimedia Downloads},
+ url = {https://dumps.wikimedia.org}
+}
+"""
+
+_DESCRIPTION = """\
+Wikipedia dataset containing cleaned articles of all languages.
+The datasets are built from the Wikipedia dump
+(https://dumps.wikimedia.org/) with one split per language. Each example
+contains the content of one full Wikipedia article with cleaning to strip
+markdown and unwanted sections (references, etc.).
+"""
+
+_LICENSE = (
+ "This work is licensed under the Creative Commons Attribution-ShareAlike "
+ "3.0 Unported License. To view a copy of this license, visit "
+ "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to "
+ "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA."
+)
+
+# Source: https://en.wikipedia.org/wiki/List_of_Wikipedias (accessed 3/1/2019)
+# Removed because no articles: hz.
+WIKIPEDIA_LANGUAGES = [
+ "aa",
+ "ab",
+ "ace",
+ "ady",
+ "af",
+ "ak",
+ "als",
+ "am",
+ "an",
+ "ang",
+ "ar",
+ "arc",
+ "arz",
+ "as",
+ "ast",
+ "atj",
+ "av",
+ "ay",
+ "az",
+ "azb",
+ "ba",
+ "bar",
+ "bat-smg",
+ "bcl",
+ "be",
+ "be-x-old",
+ "bg",
+ "bh",
+ "bi",
+ "bjn",
+ "bm",
+ "bn",
+ "bo",
+ "bpy",
+ "br",
+ "bs",
+ "bug",
+ "bxr",
+ "ca",
+ "cbk-zam",
+ "cdo",
+ "ce",
+ "ceb",
+ "ch",
+ "cho",
+ "chr",
+ "chy",
+ "ckb",
+ "co",
+ "cr",
+ "crh",
+ "cs",
+ "csb",
+ "cu",
+ "cv",
+ "cy",
+ "da",
+ "de",
+ "din",
+ "diq",
+ "dsb",
+ "dty",
+ "dv",
+ "dz",
+ "ee",
+ "el",
+ "eml",
+ "en",
+ "eo",
+ "es",
+ "et",
+ "eu",
+ "ext",
+ "fa",
+ "ff",
+ "fi",
+ "fiu-vro",
+ "fj",
+ "fo",
+ "fr",
+ "frp",
+ "frr",
+ "fur",
+ "fy",
+ "ga",
+ "gag",
+ "gan",
+ "gd",
+ "gl",
+ "glk",
+ "gn",
+ "gom",
+ "gor",
+ "got",
+ "gu",
+ "gv",
+ "ha",
+ "hak",
+ "haw",
+ "he",
+ "hi",
+ "hif",
+ "ho",
+ "hr",
+ "hsb",
+ "ht",
+ "hu",
+ "hy",
+ "ia",
+ "id",
+ "ie",
+ "ig",
+ "ii",
+ "ik",
+ "ilo",
+ "inh",
+ "io",
+ "is",
+ "it",
+ "iu",
+ "ja",
+ "jam",
+ "jbo",
+ "jv",
+ "ka",
+ "kaa",
+ "kab",
+ "kbd",
+ "kbp",
+ "kg",
+ "ki",
+ "kj",
+ "kk",
+ "kl",
+ "km",
+ "kn",
+ "ko",
+ "koi",
+ "krc",
+ "ks",
+ "ksh",
+ "ku",
+ "kv",
+ "kw",
+ "ky",
+ "la",
+ "lad",
+ "lb",
+ "lbe",
+ "lez",
+ "lfn",
+ "lg",
+ "li",
+ "lij",
+ "lmo",
+ "ln",
+ "lo",
+ "lrc",
+ "lt",
+ "ltg",
+ "lv",
+ "mai",
+ "map-bms",
+ "mdf",
+ "mg",
+ "mh",
+ "mhr",
+ "mi",
+ "min",
+ "mk",
+ "ml",
+ "mn",
+ "mr",
+ "mrj",
+ "ms",
+ "mt",
+ "mus",
+ "mwl",
+ "my",
+ "myv",
+ "mzn",
+ "na",
+ "nah",
+ "nap",
+ "nds",
+ "nds-nl",
+ "ne",
+ "new",
+ "ng",
+ "nl",
+ "nn",
+ "no",
+ "nov",
+ "nrm",
+ "nso",
+ "nv",
+ "ny",
+ "oc",
+ "olo",
+ "om",
+ "or",
+ "os",
+ "pa",
+ "pag",
+ "pam",
+ "pap",
+ "pcd",
+ "pdc",
+ "pfl",
+ "pi",
+ "pih",
+ "pl",
+ "pms",
+ "pnb",
+ "pnt",
+ "ps",
+ "pt",
+ "qu",
+ "rm",
+ "rmy",
+ "rn",
+ "ro",
+ "roa-rup",
+ "roa-tara",
+ "ru",
+ "rue",
+ "rw",
+ "sa",
+ "sah",
+ "sat",
+ "sc",
+ "scn",
+ "sco",
+ "sd",
+ "se",
+ "sg",
+ "sh",
+ "si",
+ "simple",
+ "sk",
+ "sl",
+ "sm",
+ "sn",
+ "so",
+ "sq",
+ "sr",
+ "srn",
+ "ss",
+ "st",
+ "stq",
+ "su",
+ "sv",
+ "sw",
+ "szl",
+ "ta",
+ "tcy",
+ "te",
+ "tet",
+ "tg",
+ "th",
+ "ti",
+ "tk",
+ "tl",
+ "tn",
+ "to",
+ "tpi",
+ "tr",
+ "ts",
+ "tt",
+ "tum",
+ "tw",
+ "ty",
+ "tyv",
+ "udm",
+ "ug",
+ "uk",
+ "ur",
+ "uz",
+ "ve",
+ "vec",
+ "vep",
+ "vi",
+ "vls",
+ "vo",
+ "wa",
+ "war",
+ "wo",
+ "wuu",
+ "xal",
+ "xh",
+ "xmf",
+ "yi",
+ "yo",
+ "za",
+ "zea",
+ "zh",
+ "zh-classical",
+ "zh-min-nan",
+ "zh-yue",
+ "zu",
+]
+
+_BASE_URL_TMPL = "https://dumps.wikimedia.org/{lang}wiki/{date}/"
+_INFO_FILE = "dumpstatus.json"
+
+
+class WikipediaConfig(datasets.BuilderConfig):
+    """BuilderConfig for Wikipedia."""
+
+    def __init__(self, language=None, date=None, **kwargs):
+        """BuilderConfig for Wikipedia.
+        Args:
+            language: string, the language code for the Wikipedia dump to use.
+            date: string, date of the Wikipedia dump in YYYYMMDD format. A list of
+                available dates can be found at https://dumps.wikimedia.org/enwiki/.
+            **kwargs: keyword arguments forwarded to super.
+        """
+        # Config name encodes both dump date and language, e.g. "20200501.en".
+        super(WikipediaConfig, self).__init__(
+            name="{0}.{1}".format(date, language),
+            description="Wikipedia dataset for {0}, parsed from {1} dump.".format(
+                language, date),
+            **kwargs,
+        )
+        self.date = date
+        self.language = language
+
+
+_VERSION = datasets.Version("1.0.0", "")
+
+
+class Wikipedia(datasets.BeamBasedBuilder):
+    """Wikipedia dataset built with Apache Beam from WikiMedia XML dumps."""
+
+    # NOTE(review): the upstream comment mentioned a your.org mirror, but
+    # _BASE_URL_TMPL points at dumps.wikimedia.org directly.
+    BUILDER_CONFIG_CLASS = WikipediaConfig
+    # One config per language, all pinned to the 20200501 dump.
+    BUILDER_CONFIGS = [
+        WikipediaConfig(
+            version=_VERSION,
+            language=lang,
+            date="20200501",
+        )  # pylint:disable=g-complex-comprehension
+        for lang in WIKIPEDIA_LANGUAGES
+    ]
+
+    def _info(self):
+        # Each example is one full article: page title plus cleaned body text.
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features({"title": datasets.Value(
+                "string"), "text": datasets.Value("string")}),
+            # No default supervised_keys.
+            supervised_keys=None,
+            homepage="https://dumps.wikimedia.org",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager, pipeline):
+        """Reads the dump's dumpstatus.json, collects the multistream article
+        XML URLs, and yields a single TRAIN split generator over them."""
+        def _base_url(lang):
+            return _BASE_URL_TMPL.format(lang=lang.replace("-", "_"), date=self.config.date)
+
+        lang = self.config.language
+
+        info_url = _base_url(lang) + _INFO_FILE
+        # Use dictionary since testing mock always returns the same result.
+        downloaded_files = dl_manager.download_and_extract({"info": info_url})
+
+        xml_urls = []
+        total_bytes = 0
+        with open(downloaded_files["info"], encoding="utf-8") as f:
+            dump_info = json.load(f)
+        multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"]
+        # Refuse to build from a dump that is still being generated.
+        assert (
+            multistream_dump_info["status"] == "done"
+        ), "Specified dump (%s) multistream status is not 'done': %s" % (
+            _base_url(lang),
+            multistream_dump_info["status"],
+        )
+
+        for fname, info in multistream_dump_info["files"].items():
+            if ".xml" not in fname:
+                continue
+            total_bytes += info["size"]
+            xml_urls.append(_base_url(lang) + fname)
+
+        # Use dictionary since testing mock always returns the same result.
+        downloaded_files = dl_manager.download({"xml": xml_urls})
+        if not pipeline.is_local():
+            # Remote runners need the files shipped alongside the pipeline.
+            downloaded_files = dl_manager.ship_files_with_pipeline(
+                downloaded_files, pipeline)
+
+        return [
+            datasets.SplitGenerator(  # pylint:disable=g-complex-comprehension
+                name=datasets.Split.TRAIN, gen_kwargs={
+                    "filepaths": downloaded_files["xml"], "language": lang}
+            )
+        ]
+
+    def _build_pcollection(self, pipeline, filepaths, language):
+        """Build PCollection of examples in the raw (text) form."""
+        # Imported lazily so only Beam workers need these dependencies.
+        import apache_beam as beam
+        import mwparserfromhell
+
+        def _extract_content(filepath):
+            """Extracts article content from a single WikiMedia XML file."""
+            logger.info("generating examples from = %s", filepath)
+            with beam.io.filesystems.FileSystems.open(filepath) as f:
+                f = bz2.BZ2File(filename=f)
+                # Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563
+                utf_f = codecs.getreader("utf-8")(f)
+                context = etree.iterparse(utf_f, events=("end",))
+                for unused_event, elem in context:
+                    if not elem.tag.endswith("page"):
+                        continue
+                    # Element tags carry the XML namespace as a prefix;
+                    # strip the trailing "page" to recover it for child lookups.
+                    namespace = elem.tag[:-4]
+                    title = elem.find("./{0}title".format(namespace)).text
+                    ns = elem.find("./{0}ns".format(namespace)).text
+                    id_ = elem.find("./{0}id".format(namespace)).text
+
+                    # Filter pages that are not in the "main" namespace.
+                    if ns != "0":
+                        elem.clear()
+                        continue
+
+                    raw_content = elem.find(
+                        "./{0}revision/{0}text".format(namespace)).text
+                    # Free the parsed element to keep memory bounded.
+                    elem.clear()
+
+                    # Filter redirects.
+                    if raw_content is None or raw_content.lower().startswith("#redirect"):
+                        beam.metrics.Metrics.counter(
+                            language, "filtered-redirects").inc()
+                        continue
+
+                    beam.metrics.Metrics.counter(
+                        language, "extracted-examples").inc()
+                    yield (id_, title, raw_content)
+
+        def _clean_content(inputs):
+            """Cleans raw wikicode to extract text."""
+            id_, title, raw_content = inputs
+            try:
+                text = _parse_and_clean_wikicode(
+                    raw_content, parser=mwparserfromhell)
+            except (mwparserfromhell.parser.ParserError) as e:
+                # Skip (but count) articles the wikicode parser chokes on.
+                beam.metrics.Metrics.counter(language, "parser-error").inc()
+                logger.error("mwparserfromhell ParseError: %s", e)
+                return
+
+            if not text:
+                beam.metrics.Metrics.counter(
+                    language, "empty-clean-examples").inc()
+                return
+
+            beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
+
+            yield id_, {"title": title, "text": text}
+
+        # Reshuffle between extract and clean to spread work across workers.
+        return (
+            pipeline
+            | "Initialize" >> beam.Create(filepaths)
+            | "Extract content" >> beam.FlatMap(_extract_content)
+            | "Distribute" >> beam.transforms.Reshuffle()
+            | "Clean content" >> beam.FlatMap(_clean_content)
+        )
+
+
+def _parse_and_clean_wikicode(raw_content, parser):
+    """Strips formatting and unwanted sections from raw page content.
+
+    raw_content: string of raw wikicode.
+    parser: the mwparserfromhell module (passed in so it is imported lazily
+        on Beam workers).
+
+    Returns the cleaned plain text, with sections joined by blank lines.
+    """
+    wikicode = parser.parse(raw_content)
+
+    # Filters for references, tables, and file/image links.
+    re_rm_wikilink = re.compile(
+        "^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE)
+
+    def rm_wikilink(obj):
+        # True for File:/Image:/Media: links (non-text media inclusions).
+        return bool(re_rm_wikilink.match(str(obj.title)))
+
+    def rm_tag(obj):
+        # True for <ref> footnotes and <table> markup.
+        return str(obj.tag) in {"ref", "table"}
+
+    def rm_template(obj):
+        # True for reference/note-list templates.
+        return obj.name.lower() in {"reflist", "notelist", "notelist-ua", "notelist-lr", "notelist-ur", "notelist-lg"}
+
+    def try_remove_obj(obj, section):
+        try:
+            section.remove(obj)
+        except ValueError:
+            # For unknown reasons, objects are sometimes not found.
+            pass
+
+    section_text = []
+    # Filter individual sections to clean.
+    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
+        for obj in section.ifilter_wikilinks(matches=rm_wikilink, recursive=True):
+            try_remove_obj(obj, section)
+        for obj in section.ifilter_templates(matches=rm_template, recursive=True):
+            try_remove_obj(obj, section)
+        for obj in section.ifilter_tags(matches=rm_tag, recursive=True):
+            try_remove_obj(obj, section)
+
+        section_text.append(section.strip_code().strip())
+    return "\n\n".join(section_text)
diff --git a/examples/rec/.gitignore b/examples/rec/.gitignore
new file mode 100644
index 0000000..ccb0cb2
--- /dev/null
+++ b/examples/rec/.gitignore
@@ -0,0 +1,2 @@
+datasets/
+logs/
diff --git a/examples/rec/README.md b/examples/rec/README.md
new file mode 100644
index 0000000..5fc2afa
--- /dev/null
+++ b/examples/rec/README.md
@@ -0,0 +1,42 @@
+# Recommendation Model Example (with Distributed Settings)
+In this directory we provide NCF model for recommendation task on movielens dataset.
+
+## Structure
+```
+- rec
+ - run_hetu.py basic trainer for hetu
+ - run_tf.py basic trainer for tensorflow
+ - run_tfworker.py trainer for tensorflow in PS setting
+ - run_parallax.py trainer for tensorflow in parallax setting
+ - hetu_ncf.py model implementation in hetu
+ - tf_ncf.py model implementation in tensorflow
+ - movielens.py script to download and handle dataset
+```
+
+## Prepare movielens data
+Simply `python movielens.py` .
+
+## Usage
+```bash
+# run locally
+python run_hetu.py
+# run in ps setting (locally)
+bash ps_ncf.sh
+# run in hybrid setting (locally)
+bash hybrid_ncf.sh
+
+# run tensorflow locally
+python run_tf.py
+# run tensorflow in parallax
+python {absolute_path_to}/run_parallax.py
+# run tensorflow in ps setting
+python ../ctr/tf_launch_server.py --config {config} --id {rank}
+python run_tfworker.py --rank {rank} --config {config}
+# or
+python ../ctr/tf_launch_server.py --config ../ctr/settings/tf_local_s1_w8.json --id 0
+bash tf_8workers.sh
+```
+
+
+## Configuration
+Please refer to `ctr` directory.
diff --git a/examples/rec/hetu_ncf.py b/examples/rec/hetu_ncf.py
new file mode 100644
index 0000000..ecf8b5d
--- /dev/null
+++ b/examples/rec/hetu_ncf.py
@@ -0,0 +1,47 @@
+import hetu as ht
+from hetu import init
+
+import numpy as np
+
+
+def neural_mf(user_input, item_input, y_, num_users, num_items):
+    """Neural Matrix Factorization (NeuMF = GMF branch + MLP branch) in hetu.
+
+    user_input: tensor of user ids.
+    item_input: tensor of item ids.
+    y_: tensor of 0/1 click labels.
+    num_users / num_items: embedding table sizes.
+
+    Returns (loss, y, train_op): mean BCE loss, sigmoid prediction, and the
+    SGD update op.
+    """
+    embed_dim = 8               # GMF latent dimension
+    layers = [64, 32, 16, 8]    # MLP tower widths
+    learning_rate = 0.01
+
+    # One fused table per side: the first embed_dim columns feed the GMF
+    # branch, the remaining layers[0] // 2 columns feed the MLP branch.
+    # Tables live on CPU (PS/cache-friendly for distributed runs).
+    User_Embedding = init.random_normal(
+        (num_users, embed_dim + layers[0] // 2), stddev=0.01, name="user_embed", ctx=ht.cpu(0))
+    Item_Embedding = init.random_normal(
+        (num_items, embed_dim + layers[0] // 2), stddev=0.01, name="item_embed", ctx=ht.cpu(0))
+
+    user_latent = ht.embedding_lookup_op(
+        User_Embedding, user_input, ctx=ht.cpu(0))
+    item_latent = ht.embedding_lookup_op(
+        Item_Embedding, item_input, ctx=ht.cpu(0))
+
+    # Split the fused embeddings into GMF and MLP parts.
+    mf_user_latent = ht.slice_op(user_latent, (0, 0), (-1, embed_dim))
+    mlp_user_latent = ht.slice_op(user_latent, (0, embed_dim), (-1, -1))
+    mf_item_latent = ht.slice_op(item_latent, (0, 0), (-1, embed_dim))
+    mlp_item_latent = ht.slice_op(item_latent, (0, embed_dim), (-1, -1))
+
+    W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1')
+    W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2')
+    W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3')
+    W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4')
+
+    # GMF branch: element-wise product of user/item factors.
+    mf_vector = ht.mul_op(mf_user_latent, mf_item_latent)
+    # MLP branch: concat user/item parts, then three ReLU layers.
+    mlp_vector = ht.concat_op(mlp_user_latent, mlp_item_latent, axis=1)
+    fc1 = ht.matmul_op(mlp_vector, W1)
+    relu1 = ht.relu_op(fc1)
+    fc2 = ht.matmul_op(relu1, W2)
+    relu2 = ht.relu_op(fc2)
+    fc3 = ht.matmul_op(relu2, W3)
+    relu3 = ht.relu_op(fc3)
+    # Fuse both branches and predict a click probability.
+    concat_vector = ht.concat_op(mf_vector, relu3, axis=1)
+    y = ht.matmul_op(concat_vector, W4)
+    y = ht.sigmoid_op(y)
+    loss = ht.binarycrossentropy_op(y, y_)
+    loss = ht.reduce_mean_op(loss, [0])
+    opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
+    train_op = opt.minimize(loss)
+    return loss, y, train_op
diff --git a/examples/rec/hybrid_ncf.sh b/examples/rec/hybrid_ncf.sh
new file mode 100644
index 0000000..39f7c9a
--- /dev/null
+++ b/examples/rec/hybrid_ncf.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/run_hetu.py
+
+python -m hetu.launcher ${workdir}/../ctr/settings/local_s1.yml -n 1 --sched &
+mpirun --allow-run-as-root -np 4 python ${mainpy} --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_w4.yml
diff --git a/examples/rec/movielens.py b/examples/rec/movielens.py
new file mode 100644
index 0000000..90b5048
--- /dev/null
+++ b/examples/rec/movielens.py
@@ -0,0 +1,119 @@
+import os
+import wget
+import zipfile
+from collections import defaultdict as dd
+import numpy as np
+import scipy.sparse as sp
+from tqdm import tqdm
+
+
+DATASETS = ["ml-1m", "ml-20m", "ml-25m"]
+urls = {
+ "ml-1m": "https://files.grouplens.org/datasets/movielens/ml-1m.zip",
+ "ml-20m": "https://files.grouplens.org/datasets/movielens/ml-20m.zip",
+ "ml-25m": "https://files.grouplens.org/datasets/movielens/ml-25m.zip",
+}
+
+
+def download(dataset, data_dir, num_negatives=4):
+    """Download a movielens dataset and materialize train/test files.
+
+    dataset: one of "ml-1m", "ml-20m", "ml-25m".
+    data_dir: directory for the zip archive and extracted data.
+    num_negatives: negative samples drawn per positive training instance.
+
+    Writes <data_dir>/<dataset>/test.npy (num_users x 100 item ids; column 0
+    is each user's held-out latest item, columns 1..99 sampled negatives)
+    and train.npz (user_input, item_input, labels arrays).
+    """
+    if not os.path.exists(data_dir):
+        os.mkdir(data_dir)
+    assert dataset in ["ml-1m", "ml-20m",
+                       "ml-25m"], 'Invalid dataset: %s.' % dataset
+    data_subdir = os.path.join(data_dir, dataset)
+    print('Data in', data_subdir)
+    zip_file = os.path.join(data_dir, dataset + '.zip')
+    # NOTE(review): ml-1m ships ratings.dat ("::"-separated), not
+    # ratings.csv — only ml-20m/ml-25m match this path; confirm before
+    # using ml-1m here.
+    ratings = os.path.join(data_subdir, 'ratings.csv')
+    if not os.path.exists(ratings):
+        if not os.path.exists(zip_file):
+            print('Downloading movielens %s...' % dataset)
+            wget.download(urls[dataset], zip_file)
+        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
+            print('Extracting movielens %s...' % dataset)
+            zip_ref.extractall(data_dir)
+        ratings = os.path.join(data_subdir, 'ratings.csv')
+
+    # Known (user, item) counts per dataset after dense re-indexing.
+    num_users, num_items = {
+        'ml-1m': (6040, 3706),
+        'ml-20m': (138493, 26744),
+        'ml-25m': (162541, 59047),
+    }[dataset]
+
+    # Generate raw training and testing files.
+    # Items are re-indexed densely in order of first appearance; `latest`
+    # tracks each user's most recent interaction for leave-one-out testing.
+    item_reverse_mapping = {}
+    cur_item_idx = 0
+    latest = [(0, -1)] * num_users
+    mat = sp.dok_matrix((num_users, num_items), dtype=np.float32)
+    with open(ratings, 'r') as fr:
+        fr.readline()  # skip CSV header
+        for line in tqdm(fr):
+            entries = line.strip().split(',')
+            user = int(entries[0])
+            item = int(entries[1])
+            if item not in item_reverse_mapping:
+                item_reverse_mapping[item] = cur_item_idx
+                cur_item_idx += 1
+            rating = float(entries[2])
+            if rating <= 0:
+                continue
+            reitem = item_reverse_mapping[item]
+            mat[user-1, reitem] = 1  # implicit feedback: any rating counts
+            timestamp = int(entries[-1])
+            if latest[user-1][0] < timestamp:
+                latest[user-1] = (timestamp, reitem)
+    print('#users:', num_users, '#items:', num_items)
+
+    # Column 0 holds the held-out positive; 99 columns to fill with negatives.
+    new_lates = np.concatenate((np.array(latest, dtype=np.int32)[
+        :, 1:], np.empty((num_users, 99), dtype=np.int32)), 1)
+
+    # sample for test data first, each user 99 items, using all data
+    for i, lat in enumerate(latest):
+        new_lates[i][0] = lat[1]
+        for k in range(1, 100):
+            # Rejection-sample an item the user has never interacted with.
+            j = np.random.randint(num_items)
+            while (i, j) in mat.keys():
+                j = np.random.randint(num_items)
+            new_lates[i][k] = j
+    np.save(os.path.join(data_subdir, 'test.npy'), new_lates)
+
+    # sample for train data, each data with num_negative negative samples;
+    # each user's held-out latest item is excluded from training
+    all_num = (1 + num_negatives) * (len(mat.keys()) - num_users)
+    user_input = np.empty((all_num,), dtype=np.int32)
+    item_input = np.empty((all_num,), dtype=np.int32)
+    labels = np.empty((all_num,), dtype=np.int32)
+    idx = 0
+    for (i, j) in mat.keys():
+        if new_lates[i][0] == j:
+            continue
+        # positive instance
+        user_input[idx] = i
+        item_input[idx] = j
+        labels[idx] = 1
+        idx += 1
+        # negative instances
+        for t in range(num_negatives):
+            k = np.random.randint(num_items)
+            while (i, k) in mat.keys():
+                k = np.random.randint(num_items)
+            user_input[idx] = i
+            item_input[idx] = k
+            labels[idx] = 0
+            idx += 1
+    assert all_num == idx
+    np.savez(os.path.join(data_subdir, 'train.npz'),
+             user_input=user_input, item_input=item_input, labels=labels)
+
+
+def getdata(dataset, data_dir='datasets'):
+    """Load (train.npz, test.npy) for `dataset`, downloading and
+    preprocessing them first if either file is missing."""
+    assert dataset in ["ml-1m", "ml-20m",
+                       "ml-25m"], 'Invalid dataset: %s.' % dataset
+    data_subdir = os.path.join(data_dir, dataset)
+    file_paths = [os.path.join(data_subdir, data)
+                  for data in ['train.npz', 'test.npy']]
+    if any([not os.path.exists(path) for path in file_paths]):
+        download(dataset, data_dir)
+    return np.load(file_paths[0]), np.load(file_paths[1])
+
+
+if __name__ == "__main__":
+    # Pre-fetch the 25M dataset when run as a script.
+    download('ml-25m', 'datasets')
diff --git a/examples/rec/ps_ncf.sh b/examples/rec/ps_ncf.sh
new file mode 100644
index 0000000..415f752
--- /dev/null
+++ b/examples/rec/ps_ncf.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/run_hetu.py
+
+python ${mainpy} --comm PS --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_s1_w4.yml
diff --git a/examples/rec/run_hetu.py b/examples/rec/run_hetu.py
new file mode 100644
index 0000000..e20cb85
--- /dev/null
+++ b/examples/rec/run_hetu.py
@@ -0,0 +1,187 @@
+import hetu as ht
+from hetu.launcher import launch
+
+import os
+import numpy as np
+import yaml
+import time
+import math
+import argparse
+from tqdm import tqdm
+from hetu_ncf import neural_mf
+import heapq # for retrieval topK
+
+
+def getHitRatio(ranklist, gtItem):
+ for item in ranklist:
+ if item == gtItem:
+ return 1
+ return 0
+
+
+def getNDCG(ranklist, gtItem):
+ for i in range(len(ranklist)):
+ item = ranklist[i]
+ if item == gtItem:
+ return math.log(2) / math.log(i+2)
+ return 0
+
+
+class Logging(object):
+ # Minimal logger: echoes each message to stdout and appends it to a text
+ # file. The file is truncated when the logger is constructed.
+ def __init__(self, path='logs/hetulog.txt'):
+ # NOTE(review): assumes the parent directory of `path` already exists.
+ with open(path, 'w') as fw:
+ fw.write('')
+ self.path = path
+
+ def write(self, s):
+ # Print and append one line; reopening per call keeps the file flushed
+ # even if the process dies mid-epoch.
+ print(s)
+ with open(self.path, 'a') as fw:
+ fw.write(s + '\n')
+ fw.flush()
+
+
+def worker(args):
+ # One training process: loads its shard of MovieLens, builds the NCF graph
+ # via hetu, and runs train (and optionally validate) for a fixed number of
+ # epochs. When args.comm is set this is expected to run under hetu's
+ # launcher / MPI with DMLC_* environment variables present.
+ def validate():
+ # Each validate step consumes one dataloader batch of 100 items (the
+ # 100 candidates of one test user); predictions[0] holds their scores
+ # in the same order as testItemInput.
+ hits, ndcgs = [], []
+ for idx in range(testData.shape[0]):
+ start_index = idx * 100
+ predictions = executor.run(
+ 'validate', convert_to_numpy_ret_vals=True)
+ map_item_score = {
+ testItemInput[start_index + i]: predictions[0][i] for i in range(100)}
+ gtItem = testItemInput[start_index]
+ # Evaluate top rank list
+ ranklist = heapq.nlargest(
+ topK, map_item_score, key=map_item_score.get)
+ hr = getHitRatio(ranklist, gtItem)
+ ndcg = getNDCG(ranklist, gtItem)
+ hits.append(hr)
+ ndcgs.append(ndcg)
+ hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
+ return hr, ndcg
+
+ def get_current_shard(data):
+ # Even split by rank; the last rank absorbs the remainder.
+ # rank/nrank are only bound when args.comm is set (see below).
+ if args.comm is not None:
+ part_size = data.shape[0] // nrank
+ start = part_size * rank
+ end = start + part_size if rank != nrank - 1 else data.shape[0]
+ return data[start:end]
+ else:
+ return data
+
+ device_id = 0
+ if args.comm == 'PS':
+ rank = ht.get_worker_communicate().rank()
+ nrank = int(os.environ['DMLC_NUM_WORKER'])
+ device_id = rank % 8  # assumes at most 8 GPUs per machine -- TODO confirm
+ elif args.comm == 'Hybrid':
+ comm = ht.wrapped_mpi_nccl_init()
+ device_id = comm.dev_id
+ rank = comm.rank
+ nrank = int(os.environ['DMLC_NUM_WORKER'])
+
+ from movielens import getdata
+ if args.all:
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = get_current_shard(trainData['user_input'])
+ trainItems = get_current_shard(trainData['item_input'])
+ trainLabels = get_current_shard(trainData['labels'])
+ testData = get_current_shard(testData)
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+ else:
+ # Debug-sized run: first 1024000 training rows, first 1470 test users.
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = get_current_shard(trainData['user_input'][:1024000])
+ trainItems = get_current_shard(trainData['item_input'][:1024000])
+ trainLabels = get_current_shard(trainData['labels'][:1024000])
+ testData = get_current_shard(testData[:1470])
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+
+ num_users, num_items = {
+ 'ml-1m': (6040, 3706),
+ 'ml-20m': (138493, 26744),
+ 'ml-25m': (162541, 59047),
+ }['ml-25m']
+ # assert not args.all or num_users == testData.shape[0]
+ batch_size = 1024
+ num_negatives = 4
+ topK = 10
+ # Separate train/validate dataloaders; validate batches are one user each.
+ user_input = ht.dataloader_op([
+ ht.Dataloader(trainUsers, batch_size, 'train'),
+ ht.Dataloader(testUserInput, 100, 'validate'),
+ ])
+ item_input = ht.dataloader_op([
+ ht.Dataloader(trainItems, batch_size, 'train'),
+ ht.Dataloader(testItemInput, 100, 'validate'),
+ ])
+ y_ = ht.dataloader_op([
+ ht.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'),
+ ])
+
+ loss, y, train_op = neural_mf(
+ user_input, item_input, y_, num_users, num_items)
+
+ executor = ht.Executor({'train': [loss, train_op], 'validate': [y]}, ctx=ht.gpu(device_id),
+ comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123)
+
+ # One log file per worker in distributed mode, a single file otherwise.
+ path = 'logs/hetulog_%s' % ({None: 'local',
+ 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm])
+ path += '_%d.txt' % rank if args.comm else '.txt'
+ log = Logging(path=path)
+ epoch = 7
+ start = time.time()
+ for ep in range(epoch):
+ ep_st = time.time()
+ log.write('epoch %d' % ep)
+ train_loss = []
+ for idx in tqdm(range(executor.get_batch_num('train'))):
+ loss_val = executor.run('train', convert_to_numpy_ret_vals=True)
+ train_loss.append(loss_val[0])
+
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+
+ # validate phase
+ if args.val:
+ hr, ndcg = validate()
+ # NOTE(review): "NDCF" in the log label looks like a typo for NDCG.
+ printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (
+ tra_loss, hr, ndcg, ep_en - ep_st)
+ else:
+ printstr = "train_loss: %.4f, train_time: %.4f" % (
+ tra_loss, ep_en - ep_st)
+ log.write(printstr)
+ log.write('all time: %f' % (time.time() - start))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--val", action="store_true",
+ help="whether to perform validation")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data, default to use 1024000 training data")
+ parser.add_argument("--comm", default=None,
+ help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
+ parser.add_argument("--bsp", action="store_true",
+ help="whether to use bsp instead of asp")
+ parser.add_argument("--cache", default=None, help="cache policy")
+ parser.add_argument("--bound", default=100, help="cache bound")
+ parser.add_argument(
+ "--config", type=str, default="./settings/local_s1_w4.yml", help="configuration for ps")
+ args = parser.parse_args()
+
+ if args.comm is None:
+ worker(args)
+ elif args.comm == 'Hybrid':
+ settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader)
+ value = settings['shared']
+ os.environ['DMLC_ROLE'] = 'worker'
+ for k, v in value.items():
+ os.environ[k] = str(v)
+ worker(args)
+ elif args.comm == 'PS':
+ launch(worker, args)
+ else:
+ raise NotImplementedError
diff --git a/examples/rec/run_parallax.py b/examples/rec/run_parallax.py
new file mode 100644
index 0000000..276d49a
--- /dev/null
+++ b/examples/rec/run_parallax.py
@@ -0,0 +1,192 @@
+import os
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from tf_ncf import neural_mf
+import heapq # for retrieval topK
+import math
+
+from autodist import AutoDist
+from autodist.resource_spec import ResourceSpec
+from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax
+from autodist.strategy.base import Strategy
+from autodist.kernel.common.utils import get_op_name
+from tensorflow.python.framework import ops
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+# Please DO NOT modify /etc/bash.bashrc to activate conda environment.
+# Use python_venv in spec yml file instead.
+# Use absolute path of python file.
+# Here we use the tf native partitioner instead of autodist's PartitionPS.
+
+
+class Parallaxx(PSLoadBalancing, AllReduce):
+    """
+    Modify original parallax to remove replica on CPUs.
+
+    Hybrid autodist strategy: dense variables are synchronized with chunked
+    AllReduce, sparse (embedding-style) variables go through load-balanced PS.
+    """
+
+ def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0):
+ # Initialize both parents explicitly; each stores its own config fields.
+ PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
+ AllReduce.__init__(self, chunk_size)
+
+ # pylint: disable=attribute-defined-outside-init
+ def build(self, graph_item, resource_spec):
+ """Generate the strategy."""
+ # NOTE(review): relies on autodist internals (graph_item,
+ # resource_spec, Strategy proto) whose semantics are not visible here.
+ expr = Strategy()
+
+ # For each variable, generate variable synchronizer config
+ expr.graph_config.replicas.extend(
+ [k for k, v in resource_spec.gpu_devices])
+ reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
+ self.loads = {ps: 0.0 for ps in reduction_device_names}
+
+ # Generate node config
+ node_config = []
+ for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
+ var_op_name = get_op_name(var.name)
+ grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
+ if isinstance(grad, ops.Tensor): # this is a dense variable
+ # Dense gradients are grouped into AllReduce chunks of
+ # self.chunk_size consecutive variables.
+ group_id = idx // self.chunk_size
+ config = self._gen_all_reduce_node_config(
+ var.name, group=group_id)
+ else: # sparse updates
+ # For Parallax Strategy, all PS vars are sparse so we don't use a proxy.
+ # Sparse variables are likely larger, so keeping copies would be costlier,
+ # and usually each device only requires a small part of the overall variable.
+ config = self._gen_ps_node_config(
+ var,
+ # For Parallax Strategy, all PS vars are sparse which does not need proxy.
+ False,
+ self._sync,
+ self._staleness
+ )
+ node_config.append(config)
+ expr.node_config.extend(node_config)
+
+ return expr
+
+
+def getHitRatio(ranklist, gtItem):
+ for item in ranklist:
+ if item == gtItem:
+ return 1
+ return 0
+
+
+def getNDCG(ranklist, gtItem):
+ for i in range(len(ranklist)):
+ item = ranklist[i]
+ if item == gtItem:
+ return math.log(2) / math.log(i+2)
+ return 0
+
+
+class Logging(object):
+ # Minimal logger: echoes each message to stdout and appends it to a text
+ # file. The file is truncated when the logger is constructed.
+ def __init__(self, path='logs/tflog.txt'):
+ # NOTE(review): assumes the parent directory of `path` already exists.
+ with open(path, 'w') as fw:
+ fw.write('')
+ self.path = path
+
+ def write(self, s):
+ # Print and append one line; reopening per call keeps the file flushed
+ # even if the process dies mid-epoch.
+ print(s)
+ with open(self.path, 'a') as fw:
+ fw.write(s + '\n')
+ fw.flush()
+
+
+def main():
+ resource_spec_file = os.path.join(os.path.dirname(
+ __file__), '../ctr/settings', 'plx_local_spec.yml')
+ autodist = AutoDist(resource_spec_file, Parallaxx())
+ respec = ResourceSpec(resource_spec_file)
+
+ def validate():
+ # validate phase
+ hits, ndcgs = [], []
+ for idx in range(num_users):
+ start_index = idx * 100
+ my_feed_dict = {
+ user_input: testUserInput[start_index:start_index+100],
+ item_input: testItemInput[start_index:start_index+100],
+ }
+ predictions = sess.run([y], feed_dict=my_feed_dict)
+ map_item_score = {
+ testItemInput[start_index+i]: predictions[0][i] for i in range(100)}
+
+ # Evaluate top rank list
+ ranklist = heapq.nlargest(
+ topK, map_item_score, key=map_item_score.get)
+ hr = getHitRatio(ranklist, testItemInput[start_index])
+ ndcg = getNDCG(ranklist, testItemInput[start_index])
+ hits.append(hr)
+ ndcgs.append(ndcg)
+ hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
+ return hr, ndcg
+
+ from movielens import getdata
+ trainData, testData = getdata('ml-25m', 'datasets')
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+ num_users, num_items = {
+ 'ml-1m': (6040, 3706),
+ 'ml-20m': (138493, 26744),
+ 'ml-25m': (162541, 59047),
+ }['ml-25m']
+ batch_size = 1024
+ num_negatives = 4
+ topK = 10
+ with tf.Graph().as_default() as g, autodist.scope():
+ user_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ item_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ y_ = tf.compat.v1.placeholder(tf.float32, [None, ])
+
+ loss, y, opt = neural_mf(
+ user_input, item_input, y_, num_users, num_items)
+ train_op = opt.minimize(loss)
+
+ sess = autodist.create_distributed_session()
+
+ log = Logging(path=os.path.join(
+ os.path.dirname(__file__), 'logs', 'tfplx.txt'))
+ epoch = 7
+ iterations = trainData['user_input'].shape[0] // batch_size
+ start = time.time()
+ for ep in range(epoch):
+ ep_st = time.time()
+ log.write('epoch %d' % ep)
+ train_loss = []
+ for idx in range(iterations):
+ start_index = idx * batch_size
+ my_feed_dict = {
+ user_input: trainData['user_input'][start_index:start_index+batch_size],
+ item_input: trainData['item_input'][start_index:start_index+batch_size],
+ y_: trainData['labels'][start_index:start_index+batch_size],
+ }
+
+ loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
+ train_loss.append(loss_val[0])
+
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+
+ # validate phase
+ hr, ndcg = validate()
+ printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (
+ tra_loss, hr, ndcg, ep_en - ep_st)
+ log.write(printstr)
+ log.write('all time:', (time.time() - start))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/rec/run_tf.py b/examples/rec/run_tf.py
new file mode 100644
index 0000000..f5c1e03
--- /dev/null
+++ b/examples/rec/run_tf.py
@@ -0,0 +1,145 @@
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from tf_ncf import neural_mf
+import heapq # for retrieval topK
+import math
+
+
+def getHitRatio(ranklist, gtItem):
+ for item in ranklist:
+ if item == gtItem:
+ return 1
+ return 0
+
+
+def getNDCG(ranklist, gtItem):
+ for i in range(len(ranklist)):
+ item = ranklist[i]
+ if item == gtItem:
+ return math.log(2) / math.log(i+2)
+ return 0
+
+
+class Logging(object):
+ # Minimal logger: echoes each message to stdout and appends it to a text
+ # file. The file is truncated when the logger is constructed.
+ def __init__(self, path='logs/tflog.txt'):
+ # NOTE(review): assumes the parent directory of `path` already exists.
+ with open(path, 'w') as fw:
+ fw.write('')
+ self.path = path
+
+ def write(self, s):
+ # Print and append one line; reopening per call keeps the file flushed
+ # even if the process dies mid-epoch.
+ print(s)
+ with open(self.path, 'a') as fw:
+ fw.write(s + '\n')
+ fw.flush()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--val", action="store_true",
+ help="whether to perform validation")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+
+ def validate():
+ # validate phase
+ hits, ndcgs = [], []
+ for idx in range(num_users):
+ start_index = idx * 100
+ my_feed_dict = {
+ user_input: testUserInput[start_index:start_index+100],
+ item_input: testItemInput[start_index:start_index+100],
+ }
+ predictions = sess.run([y], feed_dict=my_feed_dict)
+ map_item_score = {
+ testItemInput[start_index+i]: predictions[0][i] for i in range(100)}
+
+ # Evaluate top rank list
+ ranklist = heapq.nlargest(
+ topK, map_item_score, key=map_item_score.get)
+ hr = getHitRatio(ranklist, testItemInput[start_index])
+ ndcg = getNDCG(ranklist, testItemInput[start_index])
+ hits.append(hr)
+ ndcgs.append(ndcg)
+ hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
+ return hr, ndcg
+
+ from movielens import getdata
+ if args.all:
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = trainData['user_input']
+ trainItems = trainData['item_input']
+ trainLabels = trainData['labels']
+ testData = testData
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+ else:
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = trainData['user_input'][:1024000]
+ trainItems = trainData['item_input'][:1024000]
+ trainLabels = trainData['labels'][:1024000]
+ testData = testData[:1470]
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+ num_users, num_items = {
+ 'ml-1m': (6040, 3706),
+ 'ml-20m': (138493, 26744),
+ 'ml-25m': (162541, 59047),
+ }['ml-25m']
+ batch_size = 1024
+ num_negatives = 4
+ topK = 10
+ user_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ item_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ y_ = tf.compat.v1.placeholder(tf.float32, [None, ])
+
+ loss, y, opt = neural_mf(user_input, item_input, y_, num_users, num_items)
+ train_op = opt.minimize(loss)
+
+ init = tf.compat.v1.global_variables_initializer()
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ sess = tf.compat.v1.Session(
+ config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+ sess.run(init)
+
+ log = Logging()
+ epoch = 7
+ iterations = trainUsers.shape[0] // batch_size
+ start = time.time()
+ for ep in range(epoch):
+ ep_st = time.time()
+ log.write('epoch %d' % ep)
+ train_loss = []
+ for idx in range(iterations):
+ start_index = idx * batch_size
+ my_feed_dict = {
+ user_input: trainUsers[start_index:start_index+batch_size],
+ item_input: trainItems[start_index:start_index+batch_size],
+ y_: trainLabels[start_index:start_index+batch_size],
+ }
+
+ loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
+ train_loss.append(loss_val[0])
+
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+
+ # validate phase
+ if args.val:
+ hr, ndcg = validate()
+ printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (
+ tra_loss, hr, ndcg, ep_en - ep_st)
+ else:
+ printstr = "train_loss: %.4f, train_time: %.4f" % (
+ tra_loss, ep_en - ep_st)
+ log.write(printstr)
+ log.write('all time:%f' % (time.time() - start))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/rec/run_tfworker.py b/examples/rec/run_tfworker.py
new file mode 100644
index 0000000..20c0402
--- /dev/null
+++ b/examples/rec/run_tfworker.py
@@ -0,0 +1,192 @@
+import os
+import json
+import numpy as np
+import tensorflow as tf
+import time
+import argparse
+from tqdm import tqdm
+from tf_ncf import neural_mf
+import heapq # for retrieval topK
+import math
+
+
+def pop_env():
+ for k in ['https_proxy', 'http_proxy']:
+ if k in os.environ:
+ os.environ.pop(k)
+
+
+pop_env()
+
+
+def getHitRatio(ranklist, gtItem):
+ for item in ranklist:
+ if item == gtItem:
+ return 1
+ return 0
+
+
+def getNDCG(ranklist, gtItem):
+ for i in range(len(ranklist)):
+ item = ranklist[i]
+ if item == gtItem:
+ return math.log(2) / math.log(i+2)
+ return 0
+
+
+class Logging(object):
+ # Minimal logger: echoes each message to stdout and appends it to a text
+ # file. The file is truncated when the logger is constructed.
+ def __init__(self, path='logs/tflog.txt'):
+ # NOTE(review): assumes the parent directory of `path` already exists.
+ with open(path, 'w') as fw:
+ fw.write('')
+ self.path = path
+
+ def write(self, s):
+ # Print and append one line; reopening per call keeps the file flushed
+ # even if the process dies mid-epoch.
+ print(s)
+ with open(self.path, 'a') as fw:
+ fw.write(s + '\n')
+ fw.flush()
+
+
+def train_ncf(cluster, rank, nrank, args):
+ # One TF parameter-server worker: load this rank's data shard, build the
+ # NCF graph with variables placed on the PS jobs, join the cluster via
+ # tf.train.Server/Supervisor, and train for a fixed number of epochs.
+ def validate():
+ # validate phase: each test user has 100 candidates (1 positive + 99
+ # negatives); score them and rank top-K.
+ hits, ndcgs = [], []
+ for idx in range(testData.shape[0]):
+ start_index = idx * 100
+ my_feed_dict = {
+ user_input: testUserInput[start_index:start_index+100],
+ item_input: testItemInput[start_index:start_index+100],
+ }
+ predictions = sess.run([y], feed_dict=my_feed_dict)
+ map_item_score = {
+ testItemInput[start_index+i]: predictions[0][i] for i in range(100)}
+
+ # Evaluate top rank list
+ ranklist = heapq.nlargest(
+ topK, map_item_score, key=map_item_score.get)
+ hr = getHitRatio(ranklist, testItemInput[start_index])
+ ndcg = getNDCG(ranklist, testItemInput[start_index])
+ hits.append(hr)
+ ndcgs.append(ndcg)
+ hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
+ return hr, ndcg
+
+ def get_current_shard(data):
+ # Even split by rank; the last rank absorbs the remainder.
+ part_size = data.shape[0] // nrank
+ start = part_size * rank
+ end = start + part_size if rank != nrank - 1 else data.shape[0]
+ return data[start:end]
+
+ from movielens import getdata
+ if args.all:
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = get_current_shard(trainData['user_input'])
+ trainItems = get_current_shard(trainData['item_input'])
+ trainLabels = get_current_shard(trainData['labels'])
+ testData = get_current_shard(testData)
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+ else:
+ # Debug-sized run: first 1024000 training rows, first 1470 test users.
+ trainData, testData = getdata('ml-25m', 'datasets')
+ trainUsers = get_current_shard(trainData['user_input'][:1024000])
+ trainItems = get_current_shard(trainData['item_input'][:1024000])
+ trainLabels = get_current_shard(trainData['labels'][:1024000])
+ testData = get_current_shard(testData[:1470])
+ testUserInput = np.repeat(
+ np.arange(testData.shape[0], dtype=np.int32), 100)
+ testItemInput = testData.reshape((-1,))
+
+ num_users, num_items = {
+ 'ml-1m': (6040, 3706),
+ 'ml-20m': (138493, 26744),
+ 'ml-25m': (162541, 59047),
+ }['ml-25m']
+ batch_size = 1024
+ num_negatives = 4
+ topK = 10
+
+ # Placeholders live on this worker's GPU; variables are placed on the PS
+ # jobs by replica_device_setter below.
+ worker_device = "/job:worker/task:%d/gpu:0" % (rank)
+ with tf.device(worker_device):
+ user_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ item_input = tf.compat.v1.placeholder(tf.int32, [None, ])
+ y_ = tf.compat.v1.placeholder(tf.float32, [None, ])
+
+ with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)):
+ # Shard the embedding tables across servers only when there is more
+ # than one PS. NOTE(review): tf.fixed_size_partitioner is the TF1
+ # name -- confirm it resolves under the TF version in use.
+ server_num = len(cluster.as_dict()['ps'])
+ embed_partitioner = tf.fixed_size_partitioner(
+ server_num, 0) if server_num > 1 else None
+ loss, y, opt = neural_mf(
+ user_input, item_input, y_, num_users, num_items, embed_partitioner)
+ train_op = opt.minimize(loss)
+
+ server = tf.train.Server(
+ cluster, job_name="worker", task_index=rank)
+ init = tf.compat.v1.global_variables_initializer()
+ sv = tf.train.Supervisor(
+ is_chief=(rank == 0),
+ init_op=init,
+ recovery_wait_secs=1)
+ sess_config = tf.compat.v1.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=False,
+ device_filters=["/job:ps",
+ "/job:worker/task:%d" % rank])
+ sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
+
+ log = Logging(path='logs/tflog%d.txt' % rank)
+ epoch = 7
+ iterations = trainUsers.shape[0] // batch_size
+ start = time.time()
+ for ep in range(epoch):
+ ep_st = time.time()
+ log.write('epoch %d' % ep)
+ train_loss = []
+ for idx in tqdm(range(iterations)):
+ start_index = idx * batch_size
+ my_feed_dict = {
+ user_input: trainUsers[start_index:start_index+batch_size],
+ item_input: trainItems[start_index:start_index+batch_size],
+ y_: trainLabels[start_index:start_index+batch_size],
+ }
+
+ loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
+ train_loss.append(loss_val[0])
+
+ tra_loss = np.mean(train_loss)
+ ep_en = time.time()
+
+ # validate phase
+ if args.val:
+ hr, ndcg = validate()
+ # NOTE(review): "NDCF" in the log label looks like a typo for NDCG.
+ printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (
+ tra_loss, hr, ndcg, ep_en - ep_st)
+ else:
+ printstr = "train_loss: %.4f, train_time: %.4f" % (
+ tra_loss, ep_en - ep_st)
+ log.write(printstr)
+ log.write('all time: %f' % (time.time() - start))
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--val", action="store_true",
+ help="whether to perform validation")
+ parser.add_argument("--rank", type=int, required=True,
+ help="rank of process")
+ parser.add_argument(
+ "--config", type=str, default='../ctr/settings/tf_local_s1_w2.json', help="config file path")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ args = parser.parse_args()
+ task_id = int(args.rank)
+ raw_config = args.config
+
+ config = json.load(open(raw_config))
+ cluster = tf.train.ClusterSpec(config)
+
+ train_ncf(cluster, task_id, len(config['worker']), args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/rec/tf_8workers.sh b/examples/rec/tf_8workers.sh
new file mode 100644
index 0000000..cd8c494
--- /dev/null
+++ b/examples/rec/tf_8workers.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+workdir=$(cd $(dirname $0); pwd)
+mainpy=${workdir}/run_tfworker.py
+
+CUDA_VISIBLE_DEVICES=0 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 0 &
+CUDA_VISIBLE_DEVICES=1 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 1 &
+CUDA_VISIBLE_DEVICES=2 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 2 &
+CUDA_VISIBLE_DEVICES=3 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 3 &
+CUDA_VISIBLE_DEVICES=4 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 4 &
+CUDA_VISIBLE_DEVICES=5 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 5 &
+CUDA_VISIBLE_DEVICES=6 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 6 &
+CUDA_VISIBLE_DEVICES=7 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 7 &
+wait
\ No newline at end of file
diff --git a/examples/rec/tf_ncf.py b/examples/rec/tf_ncf.py
new file mode 100644
index 0000000..c8ca51f
--- /dev/null
+++ b/examples/rec/tf_ncf.py
@@ -0,0 +1,47 @@
+import tensorflow as tf
+
+
+def neural_mf(user_input, item_input, y_, num_users, num_items, embed_partitioner=None):
+ # NeuMF model (GMF + MLP branches): embeddings on CPU, dense math on GPU.
+ # Returns (mean sigmoid-cross-entropy loss, sigmoid scores, SGD optimizer);
+ # the caller is responsible for calling optimizer.minimize(loss).
+ embed_dim = 8
+ layers = [64, 32, 16, 8]
+ learning_rate = 0.01
+ with tf.compat.v1.variable_scope('nmf', dtype=tf.float32):
+ with tf.device('/cpu:0'):
+ # One shared table per side: first embed_dim columns feed the GMF
+ # branch, the remaining layers[0]//2 columns feed the MLP branch.
+ User_Embedding = tf.compat.v1.get_variable(name="user_embed", shape=(
+ num_users, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner)
+ Item_Embedding = tf.compat.v1.get_variable(name="item_embed", shape=(
+ num_items, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner)
+
+ user_latent = tf.nn.embedding_lookup(User_Embedding, user_input)
+ item_latent = tf.nn.embedding_lookup(Item_Embedding, item_input)
+
+ # MLP tower weights (no biases) plus the final prediction layer.
+ W1 = tf.compat.v1.get_variable(name='W1', shape=(
+ layers[0], layers[1]), initializer=tf.random_normal_initializer(stddev=0.1))
+ W2 = tf.compat.v1.get_variable(name='W2', shape=(
+ layers[1], layers[2]), initializer=tf.random_normal_initializer(stddev=0.1))
+ W3 = tf.compat.v1.get_variable(name='W3', shape=(
+ layers[2], layers[3]), initializer=tf.random_normal_initializer(stddev=0.1))
+ W4 = tf.compat.v1.get_variable(name='W4', shape=(
+ embed_dim + layers[3], 1), initializer=tf.random_normal_initializer(stddev=0.1))
+
+ with tf.device('/gpu:0'):
+ # Split each latent vector into its GMF part and its MLP part.
+ mf_user_latent, mlp_user_latent = tf.split(
+ user_latent, [embed_dim, layers[0] // 2], 1)
+ mf_item_latent, mlp_item_latent = tf.split(
+ item_latent, [embed_dim, layers[0] // 2], 1)
+ mf_vector = tf.multiply(mf_user_latent, mf_item_latent)
+ mlp_vector = tf.concat((mlp_user_latent, mlp_item_latent), 1)
+ fc1 = tf.matmul(mlp_vector, W1)
+ relu1 = tf.nn.relu(fc1)
+ fc2 = tf.matmul(relu1, W2)
+ relu2 = tf.nn.relu(fc2)
+ fc3 = tf.matmul(relu2, W3)
+ relu3 = tf.nn.relu(fc3)
+ # Fuse the two branches, project to a single logit per example.
+ concat_vector = tf.concat((mf_vector, relu3), 1)
+ y = tf.reshape(tf.matmul(concat_vector, W4), (-1,))
+ loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)
+ loss = tf.reduce_mean(loss)
+ y = tf.sigmoid(y)
+ optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+ learning_rate)
+ return loss, y, optimizer
diff --git a/examples/runner/README.md b/examples/runner/README.md
new file mode 100644
index 0000000..272dd45
--- /dev/null
+++ b/examples/runner/README.md
@@ -0,0 +1,27 @@
+## Usage
+This directory contains examples using `heturun` command.
+
+* Data Parallel (MLP model and WDL model):
+```bash
+# Local Data Parallel Using AllReduce
+heturun -c local_allreduce.yml python run_mlp.py --config lar
+
+# Local Data Parallel Using AllReduce for Dense Parameters and PS for Sparse (Embedding) Parameters
+heturun -c local_ps.yml python run_wdl.py --config lhy
+
+# Local Data Parallel Using PS
+heturun -c local_ps.yml python run_mlp.py --config lps
+heturun -c local_ps.yml python run_wdl.py --config lps
+
+# Distributed Data Parallel Using AllReduce
+heturun -c remote_allreduce.yml python run_mlp.py --config rar
+
+# Distributed Data Parallel Using AllReduce for Dense Parameters and PS for Sparse (Embedding) Parameters
+heturun -c remote_ps.yml python run_wdl.py --config rhy
+
+# Distributed Data Parallel Using PS
+heturun -c remote_ps.yml python run_mlp.py --config rps
+heturun -c remote_ps.yml python run_wdl.py --config rps
+```
+
+* For other parallel schemes, please refer to `parallel` directory.
diff --git a/examples/runner/local_allreduce.yml b/examples/runner/local_allreduce.yml
new file mode 100644
index 0000000..5028d3f
--- /dev/null
+++ b/examples/runner/local_allreduce.yml
@@ -0,0 +1,5 @@
+nodes:
+ - host: localhost
+ servers: 0
+ workers: 4
+ chief: true
diff --git a/examples/runner/local_ps.yml b/examples/runner/local_ps.yml
new file mode 100644
index 0000000..c71709c
--- /dev/null
+++ b/examples/runner/local_ps.yml
@@ -0,0 +1,5 @@
+nodes:
+ - host: localhost
+ servers: 1
+ workers: 4
+ chief: true
diff --git a/examples/runner/models/MLP.py b/examples/runner/models/MLP.py
new file mode 100644
index 0000000..df60da8
--- /dev/null
+++ b/examples/runner/models/MLP.py
@@ -0,0 +1,33 @@
+import hetu as ht
+from hetu import init
+
+
+def fc(x, shape, name, with_relu=True):
+ weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
+ bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ if with_relu:
+ x = ht.relu_op(x)
+ return x
+
+
+def mlp(x, y_):
+ '''
+ MLP model, for MNIST dataset.
+
+ Parameters:
+ x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
+ y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ Return:
+ loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
+ y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
+ '''
+
+ print("Building MLP model...")
+ x = fc(x, (784, 256), 'mlp_fc1', with_relu=True)
+ x = fc(x, (256, 256), 'mlp_fc2', with_relu=True)
+ y = fc(x, (256, 10), 'mlp_fc3', with_relu=False)
+ loss = ht.softmaxcrossentropy_op(y, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ return loss, y
diff --git a/examples/runner/models/__init__.py b/examples/runner/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/runner/models/load_data.py b/examples/runner/models/load_data.py
new file mode 100644
index 0000000..4edeb0d
--- /dev/null
+++ b/examples/runner/models/load_data.py
@@ -0,0 +1,220 @@
+import numpy as np
+import six.moves.cPickle as pickle
+import gzip
+import os
+
+
def load_mnist_data(dataset):
    """ Load the dataset, downloading it first if necessary.
    Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py
    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    :return: (train_set, valid_set, test_set) tuples of (input, target);
        input is a 2-D np.float32 array (one example per row), target a
        1-D np.int64 array of matching length.
    """
    # Resolve a bare filename against this module's directory so the
    # example works regardless of the current working directory.
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from six.moves import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    # Load the dataset. Python 3 needs encoding='latin1' to read the
    # Python-2-era pickle; on Python 2 the keyword does not exist, so
    # catch the specific TypeError and rewind before retrying (the old
    # bare `except:` masked real errors and re-read a consumed stream).
    with gzip.open(dataset, 'rb') as f:
        try:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except TypeError:
            f.seek(0)
            train_set, valid_set, test_set = pickle.load(f)
    # train_set, valid_set, test_set format: tuple(input, target)
    return train_set, valid_set, test_set
+
+
def convert_to_one_hot(vals, max_val=0):
    """Helper method to convert label array to one-hot array."""
    # Infer the class count from the labels when none is supplied.
    num_classes = max_val if max_val != 0 else vals.max() + 1
    encoded = np.zeros((vals.size, num_classes))
    encoded[np.arange(vals.size), vals] = 1
    return encoded
+
+###########################################################################
+# adult
+###########################################################################
+
+
def maybe_download(train_data, test_data):
    """Return the adult train/test DataFrames, downloading missing files.

    Parameters:
        train_data: path to the local training csv.
        test_data: path to the local testing csv.
    Return:
        (df_train, df_test) pandas DataFrames.
    """
    import pandas as pd

    # Column names for the raw UCI "adult" files, which ship headerless.
    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]

    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                               names=COLUMNS, skipinitialspace=True)
    else:
        # Bug fix: read the caller-supplied path instead of the
        # hard-coded "train.csv".
        df_train = pd.read_csv(train_data)

    if not os.path.exists(test_data):
        print("downloading testing data...")
        # skiprows=1: the remote test file begins with a non-data line.
        df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                              names=COLUMNS, skipinitialspace=True, skiprows=1)
    else:
        # Bug fix: likewise use `test_data`, not "test.csv".
        df_test = pd.read_csv(test_data)

    return df_train, df_test
+
+
def cross_columns(x_cols):
    """simple helper to build the crossed columns in a pandas dataframe
    """
    # Each crossed column is named by joining its source columns with '_'.
    return {'_'.join(group): group for group in x_cols}
+
+
def val2idx(df, cols):
    """helper to index categorical columns before embeddings.
    """
    # Map each column's values to integer indices in first-appearance
    # order, replacing the column contents in place.
    val_to_idx = dict()
    for col in cols:
        mapping = {val: idx for idx, val in enumerate(df[col].unique())}
        val_to_idx[col] = mapping
        df[col] = df[col].apply(lambda v, m=mapping: m[v])

    # Distinct-value counts per column (embedding table sizes).
    unique_vals = {col: df[col].nunique() for col in cols}

    return df, unique_vals
+
+
def onehot(x):
    # One-hot encode a 2-D array of categorical values via sklearn,
    # returning a dense numpy array.
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder()
    dense = encoder.fit_transform(x).todense()
    return np.array(dense)
+
+
def wide(df_train, df_test, wide_cols, x_cols, target):
    """Build the one-hot "wide" design matrices for train and test.

    Concatenates both frames so they share identical dummy columns,
    materializes the crossed features, one-hot encodes, then splits the
    rows back apart using an IS_TRAIN marker column.
    """
    import pandas as pd
    print('Processing wide data')
    # Tag rows so the concatenated frame can be split back later.
    # NOTE(review): this mutates the caller's frames in place.
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])

    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        df_wide.select_dtypes(include=['object']).columns)

    # NOTE(review): += mutates the caller's wide_cols list as well.
    wide_cols += list(crossed_columns_d.keys())

    # Materialize each crossed feature as a "val1-val2" string column.
    for k, v in crossed_columns_d.items():
        df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)

    df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]

    # One-hot encode every categorical (including crossed) wide column.
    dummy_cols = [
        c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
    df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols])

    # Split back into train/test; both now share the same dummy columns.
    train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    assert all(train.columns == test.columns)

    cols = [c for c in train.columns if c != target]
    X_train = train[cols].values
    y_train = train[target].values.reshape(-1, 1)
    X_test = test[cols].values
    y_test = test[target].values.reshape(-1, 1)
    return X_train, y_train, X_test, y_test
+
+
def load_adult_data(return_val=True):
    """Load and preprocess the UCI adult dataset for wide&deep models.

    Parameters:
        return_val: if True, also return the test split.
    Return:
        x_train_deep, x_train_wide, y_train[, x_test_deep, x_test_wide, y_test]
        where deep inputs are (N, 12) float32 (8 indexed categoricals +
        4 standardized continuous columns), wide inputs are one-hot
        float32 matrices, and labels are one-hot encoded.
    """
    import pandas as pd
    df_train, df_test = maybe_download("train.csv", "test.csv")

    # Binarize the income bracket into the classification label.
    df_train['income_label'] = (
        df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    df_test['income_label'] = (
        df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

    # Bucketize age into three coarse groups: (0,25], (25,65], (65,90].
    age_groups = [0, 25, 65, 90]
    age_labels = range(len(age_groups) - 1)
    df_train['age_group'] = pd.cut(
        df_train['age'], age_groups, labels=age_labels)
    df_test['age_group'] = pd.cut(
        df_test['age'], age_groups, labels=age_labels)

    # columns for wide model
    wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'gender', 'native_country', 'age_group']
    x_cols = (['education', 'occupation'], ['native_country', 'occupation'])

    # columns for deep model
    embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
                      'relationship', 'race', 'gender', 'native_country']
    cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']

    target = 'income_label'

    x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
        df_train, df_test, wide_cols, x_cols, target)
    x_train_wide = np.array(x_train_wide).astype(np.float32)
    x_test_wide = np.array(x_test_wide).astype(np.float32)

    print('Processing deep data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_deep = pd.concat([df_train, df_test])

    deep_cols = embedding_cols + cont_cols
    df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
    from sklearn.preprocessing import StandardScaler
    # Bug fix: fit the scaler on the training rows only (no test
    # leakage) but transform the whole concatenated frame, assigning the
    # raw ndarray so assignment is positional. The previous code wrapped
    # the transform of df_train in a fresh-indexed DataFrame and
    # assigned it into df_deep, whose concat index contains duplicates;
    # index alignment then corrupted the test rows.
    scaler = StandardScaler().fit(df_train[cont_cols])
    df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
    df_deep, unique_vals = val2idx(df_deep, embedding_cols)

    train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)

    # Stack one row per feature, then transpose to (samples, features).
    x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
    y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
    x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
    y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)

    x_train_deep = np.transpose(x_train_deep)
    x_test_deep = np.transpose(x_test_deep)
    y_train = onehot(y_train)
    y_test = onehot(y_test)

    if return_val:
        return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
    else:
        return x_train_deep, x_train_wide, y_train
diff --git a/examples/runner/models/wdl_adult.py b/examples/runner/models/wdl_adult.py
new file mode 100644
index 0000000..6cef28c
--- /dev/null
+++ b/examples/runner/models/wdl_adult.py
@@ -0,0 +1,57 @@
+import hetu as ht
+from hetu import init
+
+
def wdl_adult(X_deep, X_wide, y_, dense_param_ctx):
    """Wide & Deep model for the adult dataset.

    Parameters:
        X_deep: list of 12 input nodes — 8 categorical columns (looked
            up in 8-dim embeddings) followed by 4 continuous columns.
        X_wide: input node holding the wide (one-hot/crossed) features.
        y_: input node holding the 2-class one-hot labels.
        dense_param_ctx: device context on which the dense parameters
            (W, W1, b1, W2, b2) are placed.
    Return:
        (loss, prediction, y_, train_op)
    """
    lr = 5 / 128  # Python-3 true division (~0.039)
    dim_wide = 809  # number of one-hot wide features
    dim_deep = 68   # 8 embeddings * 8 dims + 4 continuous columns

    with ht.context(dense_param_ctx):
        # Dense parameters live on dense_param_ctx; embeddings below use
        # the default placement.
        W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
        W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
        b1 = init.random_normal([50], stddev=0.1, name="b1")
        W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
        b2 = init.random_normal([20], stddev=0.1, name="b2")

    # deep
    Embedding = []
    X_deep_input = None

    # Embed the 8 categorical inputs (vocab 50, dim 8) and concatenate
    # them along the feature axis.
    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        Embedding.append(init.random_normal(
            [50, 8], stddev=0.1, name=Embedding_name))
        now = ht.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ht.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ht.concat_op(X_deep_input, now, 1)

    # Append the 4 continuous inputs as single columns.
    for i in range(4):
        now = ht.array_reshape_op(X_deep[i + 8], (-1, 1))
        X_deep_input = ht.concat_op(X_deep_input, now, 1)

    # Deep tower: two ReLU layers; the dropout slots are currently
    # identity (dropoutN = reluN), i.e. dropout is disabled.
    mat1 = ht.matmul_op(X_deep_input, W1)
    add1 = mat1 + ht.broadcastto_op(b1, mat1)
    relu1 = ht.relu_op(add1)
    dropout1 = relu1
    mat2 = ht.matmul_op(dropout1, W2)
    add2 = mat2 + ht.broadcastto_op(b2, mat2)
    relu2 = ht.relu_op(add2)
    dropout2 = relu2
    dmodel = dropout2

    # wide
    # Concatenate wide features with the 20-dim deep output, then a
    # single linear layer produces the 2-class logits.
    wmodel = ht.concat_op(X_wide, dmodel, 1)
    wmodel = ht.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ht.softmaxcrossentropy_op(prediction, y_)
    loss = ht.reduce_mean_op(loss, [0])

    opt = ht.optim.SGDOptimizer(learning_rate=lr)
    train_op = opt.minimize(loss)

    return loss, prediction, y_, train_op
diff --git a/examples/runner/parallel/README.md b/examples/runner/parallel/README.md
new file mode 100644
index 0000000..f5fc8d9
--- /dev/null
+++ b/examples/runner/parallel/README.md
@@ -0,0 +1,35 @@
+## Usage
+* Complex Pipeline Parallel (not using heturun):
+```bash
+mpirun --allow-run-as-root --tag-output -np 8 python complex_pipeline_mlp.py
+```
+
+* Simple Pipeline Parallel:
+```bash
+heturun -c config8.yml python simple_pipeline_mlp.py
+```
+
+* Data + Pipeline Parallel:
+```bash
+heturun -c config8.yml python data_pipeline_mlp.py
+```
+
+* Multiple Machine Data + Pipeline Parallel:
+```bash
+heturun -c dist_config8.yml python dist_data_pipeline_mlp.py
+```
+
+* Test Model Parallel (the following commands should give the same results):
+```bash
+heturun -c config3.yml python test_model_mlp_base.py --save
+heturun -c config4.yml python test_model_mlp.py --split left
+heturun -c config4.yml python test_model_mlp.py --split right
+heturun -c config4.yml python test_model_mlp.py --split middle
+```
+
+* Data + Model (+ Pipeline) Parallel:
+```bash
+heturun -c config8.yml python data_model_pipeline_mlp.py --split left
+heturun -c config8.yml python data_model_pipeline_mlp.py --split right
+heturun -c config8.yml python data_model_pipeline_mlp.py --split middle
+```
diff --git a/examples/runner/parallel/complex_pipeline_mlp.py b/examples/runner/parallel/complex_pipeline_mlp.py
new file mode 100644
index 0000000..a9a3d25
--- /dev/null
+++ b/examples/runner/parallel/complex_pipeline_mlp.py
@@ -0,0 +1,200 @@
+import hetu as ht
+from hetu import stream
+from hetu import init
+
+import os
+import sys
+import json
+import time
+import argparse
+import numpy as np
+import logging
+
+np.random.seed(123)
+
+
def convert_to_one_hot(vals, max_val=0):
    """Helper method to convert label array to one-hot array."""
    if max_val == 0:
        # Default: derive the class count from the largest label.
        max_val = vals.max() + 1
    rows = np.arange(vals.size)
    result = np.zeros((vals.size, max_val))
    result[rows, vals] = 1
    return result
+
+
def fc(x, shape, name, with_relu=True, ctx=None):
    # Fully-connected layer placed on the given device context:
    # x @ weight + bias, optionally followed by ReLU.
    weight = init.random_normal(
        shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx)
    bias = init.random_normal(
        shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx)
    x = ht.matmul_op(x, weight)
    # broadcastto_op expands bias to x's shape before the add.
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--steps', type=int, default=8, help='training steps')
+ parser.add_argument('--warmup', type=int, default=2,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int, default=8, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.00001, help='learning rate')
+ args = parser.parse_args()
+
+ # init and opt for both ranks
+ comm = ht.wrapped_mpi_nccl_init()
+ device_id = comm.dev_id
+ print("mpi_nccl init for gpu device: {}".format(device_id))
+ executor_ctx = ht.gpu(device_id)
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+
+ # init logger
+ logger = logging.getLogger()
+ ch = logging.StreamHandler()
+ formatter = logging.Formatter('[rank{}, PID{}]'.format(
+ device_id, os.getpid()) + ' %(asctime)s: %(message)s')
+ ch.setLevel(logging.DEBUG)
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+ log = logger.warning
+
+ # nccl communicate stream for pipeline_send/receive
+ communicate_stream = stream.create_stream_handle(executor_ctx)
+
+ # dataset
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ batch_size = 10000
+ batch_num = 5
+ value_x_list = []
+ value_y_list = []
+ for i in range(batch_num):
+ start = i * batch_size
+ ending = (i+1) * batch_size
+ value_x_list.append(train_set_x[start:ending])
+ value_y_list.append(train_set_y[start:ending])
+
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+
+ # model parallel
+ if comm.myRank.value == 0:
+ # rank0
+
+ # forward
+ activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True,
+ ctx=ht.gpu(comm.localRank.value))
+ activation = fc(activation, (1024, 2048), 'mlp_fc2',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ activation = fc(activation, (2048, 1024), 'mlp_fc3',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ activation_send_op = ht.pipeline_send_op(
+ activation, 1, comm, stream=communicate_stream)
+
+ # backward
+ gradient_receive_op = ht.pipeline_receive_op(
+ 1, comm, ctx=executor_ctx, stream=communicate_stream)
+ required_vars = opt.get_var_list(activation)
+ opt.params = required_vars
+ grads = ht.gradients(activation, required_vars,
+ insert_grad=gradient_receive_op)
+ train_op = ht.optim.OptimizerOp(grads, opt)
+
+ executor = ht.Executor(
+ [activation_send_op, train_op], ctx=executor_ctx)
+
+ elif comm.myRank.value != 7:
+ # from rank1 to rank6
+ previous_rank = comm.myRank.value - 1
+ next_rank = comm.myRank.value + 1
+
+ # 1. receive activation from previous rank
+ activation_receive_op = ht.pipeline_receive_op(
+ previous_rank, comm, ctx=executor_ctx, stream=communicate_stream)
+ # forward
+ activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ activation = fc(activation, (2048, 2048), 'mlp_fc2',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ activation = fc(activation, (2048, 1024), 'mlp_fc3',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+
+ # 2. send activation to next rank
+ activation_send_op = ht.pipeline_send_op(
+ activation, next_rank, comm, ctx=executor_ctx, stream=communicate_stream)
+
+ # 3. receive gradients from next rank
+ gradient_receive_op = ht.pipeline_receive_op(
+ next_rank, comm, ctx=executor_ctx, stream=communicate_stream)
+ # backward
+ required_vars = opt.get_var_list(activation)
+ opt.params = required_vars
+ required_vars = [activation_receive_op] + required_vars
+ grads = ht.gradients(activation, required_vars,
+ insert_grad=gradient_receive_op)
+ train_op = ht.optim.OptimizerOp(grads[1:], opt)
+
+ # 4. send gradients to previous rank
+ sendback_grad_op = ht.pipeline_send_op(
+ grads[0], previous_rank, comm, stream=communicate_stream)
+
+ executor = ht.Executor(
+ [activation_send_op, sendback_grad_op, train_op], ctx=executor_ctx)
+
+ else:
+ # rank7
+ activation_receive_op = ht.pipeline_receive_op(
+ 6, comm, ctx=executor_ctx, stream=communicate_stream)
+
+ # forward
+ activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ activation = fc(activation, (2048, 1024), 'mlp_fc2',
+ with_relu=True, ctx=ht.gpu(comm.localRank.value))
+ y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+
+ # backward
+ required_vars = opt.get_var_list(loss)
+ opt.params = required_vars
+ required_vars = [activation_receive_op] + required_vars
+ grads = ht.gradients(loss, required_vars)
+ train_op = ht.optim.OptimizerOp(grads[1:], opt)
+
+ sendback_grad_op = ht.pipeline_send_op(
+ grads[0], 6, comm, stream=communicate_stream)
+ executor = ht.Executor(
+ [loss, sendback_grad_op, train_op], ctx=executor_ctx)
+
+ # training
+ for step in range(args.steps):
+ if step == args.warmup:
+ start = time.time()
+ if comm.myRank.value == 0:
+ log("step {}:".format(step))
+ if comm.myRank.value == 0:
+ executor.run(feed_dict={x: value_x_list[step % batch_num]})
+ log("gpu0 ok")
+ elif comm.myRank.value == 7:
+ loss, _, _ = executor.run(
+ feed_dict={y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True)
+ log("gpu7 ok, loss: {}".format(loss[0]))
+ else:
+ executor.run()
+ log("gpu{} ok".format(comm.myRank.value))
+
+ # comm.stream.sync()
+ if communicate_stream:
+ communicate_stream.sync()
+
+ end = time.time()
+ log("time elapsed for {} steps: {}s".format(
+ args.steps-args.warmup, round(end-start, 3)))
diff --git a/examples/runner/parallel/config3.yml b/examples/runner/parallel/config3.yml
new file mode 100644
index 0000000..7c2ad6f
--- /dev/null
+++ b/examples/runner/parallel/config3.yml
@@ -0,0 +1,5 @@
+nodes:
+ - host: localhost
+ servers: 0
+ workers: 3
+ chief: true
diff --git a/examples/runner/parallel/config4.yml b/examples/runner/parallel/config4.yml
new file mode 100644
index 0000000..5028d3f
--- /dev/null
+++ b/examples/runner/parallel/config4.yml
@@ -0,0 +1,5 @@
+nodes:
+ - host: localhost
+ servers: 0
+ workers: 4
+ chief: true
diff --git a/examples/runner/parallel/config8.yml b/examples/runner/parallel/config8.yml
new file mode 100644
index 0000000..2676c7d
--- /dev/null
+++ b/examples/runner/parallel/config8.yml
@@ -0,0 +1,5 @@
+nodes:
+ - host: localhost
+ servers: 0
+ workers: 8
+ chief: true
diff --git a/examples/runner/parallel/data_model_pipeline_mlp.py b/examples/runner/parallel/data_model_pipeline_mlp.py
new file mode 100644
index 0000000..ff6539c
--- /dev/null
+++ b/examples/runner/parallel/data_model_pipeline_mlp.py
@@ -0,0 +1,97 @@
+import hetu as ht
+
+import time
+import argparse
+
+
def fc(x, shape, name, with_relu=True, ctx=None):
    # Fully-connected layer: x @ weight + bias (+ optional ReLU), with
    # parameters optionally pinned to a device context.
    weight = ht.init.random_normal(
        shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx)
    bias = ht.init.random_normal(
        shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx)
    x = ht.matmul_op(x, weight)
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--steps', type=int, default=8, help='training steps')
+ parser.add_argument('--warmup', type=int, default=2,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int, default=8, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.00001, help='learning rate')
+ parser.add_argument('--split', type=str, default='left',
+ help='left, middle, right')
+ args = parser.parse_args()
+ assert args.split in ('left', 'middle', 'right')
+
+ # dataset
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ batch_size = 10000
+ batch_num = 5
+ value_x_list = []
+ value_y_list = []
+ for i in range(batch_num):
+ start = i * batch_size
+ ending = (i+1) * batch_size
+ value_x_list.append(train_set_x[start:ending])
+ value_y_list.append(train_set_y[start:ending])
+
+ # model parallel
+ with ht.context([ht.gpu(0), ht.gpu(4)]):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True)
+ activation = fc(activation, (1024, 2048), 'mlp_fc2', with_relu=True)
+ activation = fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True)
+ if args.split == 'left':
+ activation = ht.dispatch(activation, (2, 1))
+ weight = ht.dispatch(ht.init.random_normal(
+ shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 1), duplicate=2)
+ elif args.split == 'right':
+ activation = ht.dispatch(activation, (1, 1), duplicate=2)
+ weight = ht.dispatch(ht.init.random_normal(
+ shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 2))
+ else:
+ activation = ht.dispatch(activation, (1, 2))
+ weight = ht.dispatch(ht.init.random_normal(
+ shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (2, 1))
+
+ with ht.context([(ht.gpu(1), ht.gpu(2)), (ht.gpu(5), ht.gpu(6))]):
+ activation = ht.matmul_op(activation, weight)
+ activation = ht.dispatch(activation, (1, 1))
+
+ with ht.context([ht.gpu(3), ht.gpu(7)]):
+ activation = ht.relu_op(activation)
+ activation = fc(activation, (2048, 2048), 'mlp_fc2', with_relu=True)
+ activation = fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True)
+ y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+
+ executor = ht.Executor([loss, train_op])
+
+ # training
+ for step in range(args.steps):
+ if step == args.warmup:
+ start = time.time()
+ loss_val, _ = executor.run(feed_dict={
+ x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True)
+ if executor.rank == 3:
+ print('step:', step, 'loss:', loss_val)
+
+ end = time.time()
+ if executor.rank == 3:
+ print("time elapsed for {} steps: {}s".format(
+ args.steps-args.warmup, round(end-start, 3)))
diff --git a/examples/runner/parallel/data_pipeline_mlp.py b/examples/runner/parallel/data_pipeline_mlp.py
new file mode 100644
index 0000000..076a5bd
--- /dev/null
+++ b/examples/runner/parallel/data_pipeline_mlp.py
@@ -0,0 +1,66 @@
+import hetu as ht
+
+import os
+import time
+import argparse
+import numpy as np
+
+
def fc(x, shape, name, with_relu=True):
    # Fully-connected layer: x @ weight + bias, optionally ReLU-ed.
    weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight')
    bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias')
    x = ht.matmul_op(x, weight)
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--warmup', type=int, default=1,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int,
+ default=10000, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.01, help='learning rate')
+ args = parser.parse_args()
+
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ with ht.context("gpu:0,gpu:4"):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True)
+
+ with ht.context("gpu:1,gpu:5"):
+ activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True)
+ activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True)
+
+ with ht.context("gpu:2,gpu:6"):
+ activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True)
+ activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True)
+
+ with ht.context("gpu:3,gpu:7"):
+ y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+ executor = ht.Executor([loss, train_op])
+
+ print_devices = [3, 7]
+
+ # training
+ steps = train_set_x.shape[0] // args.batch_size
+ for step in range(steps):
+ start = step * args.batch_size
+ end = start + args.batch_size
+ loss_val, _ = executor.run(feed_dict={
+ x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True)
+ if executor.local_rank in print_devices:
+ print('[step {}]: loss: {}'.format(step, loss_val[0]))
diff --git a/examples/runner/parallel/dist_config8.yml b/examples/runner/parallel/dist_config8.yml
new file mode 100644
index 0000000..4129fb9
--- /dev/null
+++ b/examples/runner/parallel/dist_config8.yml
@@ -0,0 +1,8 @@
+nodes:
+ - host: daim118
+ servers: 0
+ workers: 4
+ chief: true
+ - host: daim117
+ servers: 0
+ workers: 4
\ No newline at end of file
diff --git a/examples/runner/parallel/dist_data_pipeline_mlp.py b/examples/runner/parallel/dist_data_pipeline_mlp.py
new file mode 100644
index 0000000..e05d228
--- /dev/null
+++ b/examples/runner/parallel/dist_data_pipeline_mlp.py
@@ -0,0 +1,68 @@
+import hetu as ht
+
+import os
+import time
+import argparse
+import numpy as np
+import socket
+
+
def fc(x, shape, name, with_relu=True):
    # Fully-connected layer: x @ weight + bias, optionally ReLU-ed.
    weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight')
    bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias')
    x = ht.matmul_op(x, weight)
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--warmup', type=int, default=1,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int,
+ default=10000, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.01, help='learning rate')
+ args = parser.parse_args()
+
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ with ht.context([ht.rgpu('daim117', 0), ht.rgpu('daim117', 1)]):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True)
+
+ with ht.context([ht.rgpu('daim117', 2), ht.rgpu('daim117', 3)]):
+ activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True)
+ activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True)
+
+ with ht.context([ht.rgpu('daim118', 0), ht.rgpu('daim118', 1)]):
+ activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True)
+ activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True)
+
+ with ht.context([ht.rgpu('daim118', 2), ht.rgpu('daim118', 3)]):
+ y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+ executor = ht.Executor([loss, train_op])
+
+ print_ranks = [2, 3]
+ hostname = socket.gethostname()
+
+ # training
+ steps = train_set_x.shape[0] // args.batch_size
+ for step in range(steps):
+ start = step * args.batch_size
+ end = start + args.batch_size
+ loss_val, _ = executor.run(feed_dict={
+ x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True)
+ if executor.local_rank in print_ranks and hostname == 'daim118':
+ print('[step {}]: loss: {}'.format(step, loss_val[0]))
diff --git a/examples/runner/parallel/simple_pipeline_mlp.py b/examples/runner/parallel/simple_pipeline_mlp.py
new file mode 100644
index 0000000..d200364
--- /dev/null
+++ b/examples/runner/parallel/simple_pipeline_mlp.py
@@ -0,0 +1,76 @@
+import hetu as ht
+
+import os
+import time
+import argparse
+import numpy as np
+
+
def fc(x, shape, name, with_relu=True):
    # Fully-connected layer: x @ weight + bias, optionally ReLU-ed.
    weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight')
    bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias')
    x = ht.matmul_op(x, weight)
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--epochs', type=int, default=8,
+ help='training epochs')
+ parser.add_argument('--warmup', type=int, default=1,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int,
+ default=10000, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.01, help='learning rate')
+ args = parser.parse_args()
+
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ # pipeline parallel
+ with ht.context(ht.gpu(0)):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True)
+
+ for i in range(1, 7):
+ with ht.context(ht.gpu(i)):
+ activation = fc(activation, (1024, 1024), 'mlp_fc%d' %
+ (i + 1), with_relu=True)
+
+ with ht.context(ht.gpu(7)):
+ y_pred = fc(activation, (1024, 10), 'mlp_fc8', with_relu=True)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+
+ executor = ht.Executor([loss, train_op])
+
+ # training
+ steps = train_set_x.shape[0] // args.batch_size
+ for epoch in range(args.epochs):
+ loss_vals = []
+ if epoch == args.warmup:
+ start_time = time.time()
+ for step in range(steps):
+ start = step * args.batch_size
+ end = start + args.batch_size
+ loss_val, _ = executor.run(feed_dict={
+ x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True)
+ loss_vals.append(loss_val)
+ if executor.rank == 7:
+ print('epoch: {}, loss: {}'.format(epoch, np.mean(loss_vals)))
+
+ if executor.rank == 0:
+ end_time = time.time()
+ print("time elapsed for {} epochs: {}s".format(
+ args.epochs-args.warmup, round(end_time-start_time, 3)))
diff --git a/examples/runner/parallel/test_model_mlp.py b/examples/runner/parallel/test_model_mlp.py
new file mode 100644
index 0000000..2eeab1a
--- /dev/null
+++ b/examples/runner/parallel/test_model_mlp.py
@@ -0,0 +1,92 @@
+import hetu as ht
+
+import time
+import argparse
+import numpy as np
+
+
def fc(x, shape, name, with_relu=True, ctx=None):
    # Fully-connected layer whose parameters are LOADED from the 'std/'
    # directory (written by test_model_mlp_base.py --save) so results
    # can be compared across parallelization schemes.
    # NOTE(review): the `shape` parameter is unused — the loaded arrays
    # determine the layer shape.
    weight_save = np.load('std/' + name + '_weight.npy')
    bias_save = np.load('std/' + name + '_bias.npy')
    weight = ht.Variable(value=weight_save, name=name+'_weight', ctx=ctx)
    bias = ht.Variable(value=bias_save, name=name+'_bias', ctx=ctx)
    x = ht.matmul_op(x, weight)
    x = x + ht.broadcastto_op(bias, x)
    if with_relu:
        x = ht.relu_op(x)
    return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--steps', type=int, default=8, help='training steps')
+ parser.add_argument('--warmup', type=int, default=2,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int, default=8, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.00001, help='learning rate')
+ parser.add_argument('--split', type=str, default='left')
+ args = parser.parse_args()
+ assert args.split in ('left', 'right', 'middle')
+
+ # dataset
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ batch_size = 10000
+ batch_num = 5
+ value_x_list = []
+ value_y_list = []
+ for i in range(batch_num):
+ start = i * batch_size
+ ending = (i+1) * batch_size
+ value_x_list.append(train_set_x[start:ending])
+ value_y_list.append(train_set_y[start:ending])
+
+ # model parallel
+ with ht.context(ht.gpu(0)):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True)
+ weight_save = np.load('std/' + 'special_weight.npy')
+ weight = ht.Variable(value=weight_save, name='mlp_fc1_weight')
+ if args.split == 'left':
+ activation = ht.dispatch(activation, (2, 1))
+ weight = ht.dispatch(weight, (1, 1), duplicate=2)
+ elif args.split == 'right':
+ activation = ht.dispatch(activation, (1, 1), duplicate=2)
+ weight = ht.dispatch(weight, (1, 2))
+ else:
+ activation = ht.dispatch(activation, (1, 2))
+ weight = ht.dispatch(weight, (2, 1))
+
+ with ht.context((ht.gpu(1), ht.gpu(2))):
+ activation = ht.matmul_op(activation, weight)
+ activation = ht.dispatch(activation, (1, 1))
+
+ with ht.context(ht.gpu(3)):
+ activation = ht.relu_op(activation)
+ y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+
+ executor = ht.Executor([loss, train_op])
+
+ # training
+ for step in range(args.steps):
+ if step == args.warmup:
+ start = time.time()
+ loss_val, _ = executor.run(feed_dict={
+ x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True)
+ if executor.rank == 3:
+ print('step:', step, 'loss:', loss_val)
+
+ end = time.time()
+ if executor.rank == 3:
+ print("time elapsed for {} steps: {}s".format(
+ args.steps-args.warmup, round(end-start, 3)))
diff --git a/examples/runner/parallel/test_model_mlp_base.py b/examples/runner/parallel/test_model_mlp_base.py
new file mode 100644
index 0000000..2941e96
--- /dev/null
+++ b/examples/runner/parallel/test_model_mlp_base.py
@@ -0,0 +1,94 @@
+import hetu as ht
+
+import time
+import argparse
+import os
+import numpy as np
+
+
+def fc(x, shape, name, with_relu=True, rank=-1):
+ weight_save = np.random.normal(0, 0.04, size=shape)
+ bias_save = np.random.normal(0, 0.04, size=shape[-1:])
+ weight = ht.Variable(value=weight_save, name=name+'_weight')
+ bias = ht.Variable(value=bias_save, name=name+'_bias')
+ global args
+ if args.save and args.rank == rank:
+ np.save('std/' + name + '_weight.npy', weight_save)
+ np.save('std/' + name + '_bias.npy', bias_save)
+ x = ht.matmul_op(x, weight)
+ x = x + ht.broadcastto_op(bias, x)
+ if with_relu:
+ x = ht.relu_op(x)
+ return x
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--steps', type=int, default=8, help='training steps')
+ parser.add_argument('--warmup', type=int, default=2,
+ help='warm up steps excluded from timing')
+ parser.add_argument('--batch-size', type=int, default=8, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.00001, help='learning rate')
+ parser.add_argument('--save', action='store_true')
+ global args
+ args = parser.parse_args()
+ if args.save:
+ comm = ht.wrapped_mpi_nccl_init()
+ args.rank = comm.rank
+ if args.rank == 0 and not os.path.exists('std'):
+ os.mkdir('std')
+
+ # dataset
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+
+ batch_size = 10000
+ batch_num = 5
+ value_x_list = []
+ value_y_list = []
+ for i in range(batch_num):
+ start = i * batch_size
+ ending = (i+1) * batch_size
+ value_x_list.append(train_set_x[start:ending])
+ value_y_list.append(train_set_y[start:ending])
+
+ # model parallel
+ with ht.context(ht.gpu(0)):
+ x = ht.Variable(name="dataloader_x", trainable=False)
+ activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True, rank=0)
+
+ with ht.context(ht.gpu(1)):
+ weight_save = np.random.normal(0, 0.04, size=(1024, 2048))
+ if args.save and args.rank == 1:
+ np.save('std/' + 'special_weight.npy', weight_save)
+ weight = ht.Variable(value=weight_save, name='mlp_fc1_weight')
+ activation = ht.matmul_op(activation, weight)
+
+ with ht.context(ht.gpu(2)):
+ activation = ht.relu_op(activation)
+ y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False, rank=2)
+ y_ = ht.Variable(name="dataloader_y", trainable=False)
+ loss = ht.softmaxcrossentropy_op(y_pred, y_)
+ loss = ht.reduce_mean_op(loss, [0])
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ train_op = opt.minimize(loss)
+
+ executor = ht.Executor([loss, train_op])
+
+ # training
+ for step in range(args.steps):
+ if step == args.warmup:
+ start = time.time()
+ loss_val, _ = executor.run(feed_dict={
+ x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True)
+ if executor.rank == 2:
+ print('step:', step, 'loss:', loss_val)
+
+ end = time.time()
+ if executor.rank == 2:
+ print("time elapsed for {} steps: {}s".format(
+ args.steps-args.warmup, round(end-start, 3)))
diff --git a/examples/runner/remote_allreduce.yml b/examples/runner/remote_allreduce.yml
new file mode 100644
index 0000000..7da1df7
--- /dev/null
+++ b/examples/runner/remote_allreduce.yml
@@ -0,0 +1,6 @@
+nodes:
+ - host: daim118
+ workers: 4
+ chief: true
+ - host: daim117
+ workers: 2
diff --git a/examples/runner/remote_ps.yml b/examples/runner/remote_ps.yml
new file mode 100644
index 0000000..fee7c84
--- /dev/null
+++ b/examples/runner/remote_ps.yml
@@ -0,0 +1,8 @@
+nodes:
+ - host: daim118
+ servers: 1
+ workers: 4
+ chief: true
+ - host: daim117
+ servers: 1
+ workers: 2
diff --git a/examples/runner/run_mlp.py b/examples/runner/run_mlp.py
new file mode 100644
index 0000000..e773dcb
--- /dev/null
+++ b/examples/runner/run_mlp.py
@@ -0,0 +1,118 @@
+import hetu as ht
+from models import MLP
+
+import os
+import numpy as np
+import argparse
+import json
+from time import time
+
+
+if __name__ == "__main__":
+ # argument parser
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--config', type=str, default='local',
+ help='[local, lps(localps), lar(localallreduce), rps(remoteps), rar]')
+ parser.add_argument('--batch-size', type=int,
+ default=128, help='batch size')
+ parser.add_argument('--learning-rate', type=float,
+ default=0.1, help='learning rate')
+ parser.add_argument('--opt', type=str, default='sgd',
+ help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
+ parser.add_argument('--num-epochs', type=int,
+ default=10, help='epoch number')
+ parser.add_argument('--validate', action='store_true',
+ help='whether to use validation')
+ parser.add_argument('--timing', action='store_true',
+ help='whether to time the training phase')
+ args = parser.parse_args()
+
+ dataset = 'MNIST'
+
+ assert args.opt in ['sgd', 'momentum', 'nesterov',
+ 'adagrad', 'adam'], 'Optimizer not supported!'
+ if args.opt == 'sgd':
+ print('Use SGD Optimizer.')
+ opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
+ elif args.opt == 'momentum':
+ print('Use Momentum Optimizer.')
+ opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate)
+ elif args.opt == 'nesterov':
+ print('Use Nesterov Momentum Optimizer.')
+ opt = ht.optim.MomentumOptimizer(
+ learning_rate=args.learning_rate, nesterov=True)
+ elif args.opt == 'adagrad':
+ print('Use AdaGrad Optimizer.')
+ opt = ht.optim.AdaGradOptimizer(
+ learning_rate=args.learning_rate, initial_accumulator_value=0.1)
+ else:
+ print('Use Adam Optimizer.')
+ opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate)
+
+ # data loading
+ print('Loading %s data...' % dataset)
+ if dataset == 'MNIST':
+ datasets = ht.data.mnist()
+ train_set_x, train_set_y = datasets[0]
+ valid_set_x, valid_set_y = datasets[1]
+ test_set_x, test_set_y = datasets[2]
+ # train_set_x: (50000, 784), train_set_y: (50000,)
+ # valid_set_x: (10000, 784), valid_set_y: (10000,)
+ # x_shape = (args.batch_size, 784)
+ # y_shape = (args.batch_size, 10)
+
+ # model definition
+ ctx = {
+ 'local': ht.gpu(0),
+ 'lps': [ht.cpu(0), ht.gpu(0), ht.gpu(1), ht.gpu(4), ht.gpu(5)],
+ 'lar': [ht.gpu(1), ht.gpu(2), ht.gpu(3), ht.gpu(6)],
+ 'rps': ['cpu:0', 'daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3'],
+ 'rar': ['daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3']
+ }[args.config]
+ with ht.context(ctx):
+ print('Building model...')
+ x = ht.dataloader_op([
+ ht.Dataloader(train_set_x, args.batch_size, 'train'),
+ ht.Dataloader(valid_set_x, args.batch_size, 'validate'),
+ ])
+ y_ = ht.dataloader_op([
+ ht.Dataloader(train_set_y, args.batch_size, 'train'),
+ ht.Dataloader(valid_set_y, args.batch_size, 'validate'),
+ ])
+
+ loss, y = MLP.mlp(x, y_)
+ train_op = opt.minimize(loss)
+
+ executor = ht.Executor(
+ {'train': [loss, y, train_op], 'validate': [loss, y, y_]})
+ n_train_batches = executor.get_batch_num('train')
+ n_valid_batches = executor.get_batch_num('validate')
+
+ # training
+ print("Start training loop...")
+ for i in range(args.num_epochs):
+ print("Epoch %d" % i)
+ loss_all = 0
+ if args.timing:
+ start = time()
+ for minibatch_index in range(n_train_batches):
+ loss_val, predict_y, _ = executor.run('train')
+ loss_val = loss_val.asnumpy()
+ loss_all += loss_val * x.dataloaders['train'].last_batch_size
+ loss_all /= len(train_set_x)
+ print("Loss = %f" % loss_all)
+ if args.timing:
+ end = time()
+ print("Time = %f" % (end - start))
+
+ if args.validate:
+ correct_predictions = []
+ for minibatch_index in range(n_valid_batches):
+ loss_val, valid_y_predicted, y_val = executor.run(
+ 'validate', convert_to_numpy_ret_vals=True)
+ correct_prediction = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(valid_y_predicted, 1)).astype(np.float32)
+ correct_predictions.extend(correct_prediction)
+ accuracy = np.mean(correct_predictions)
+ print("Validation accuracy = %f" % accuracy)
diff --git a/examples/runner/run_wdl.py b/examples/runner/run_wdl.py
new file mode 100644
index 0000000..e4163c7
--- /dev/null
+++ b/examples/runner/run_wdl.py
@@ -0,0 +1,130 @@
+import hetu as ht
+from hetu.launcher import launch
+
+import os
+import numpy as np
+import yaml
+import time
+import argparse
+from tqdm import tqdm
+from sklearn import metrics
+from models import load_data, wdl_adult
+
+
+def worker(args):
+ def train(iterations, auc_enabled=True, tqdm_enabled=False):
+ localiter = tqdm(range(iterations)
+ ) if tqdm_enabled else range(iterations)
+ train_loss = []
+ train_acc = []
+ if auc_enabled:
+ train_auc = []
+ for it in localiter:
+ loss_val, predict_y, y_val, _ = executor.run(
+ 'train', convert_to_numpy_ret_vals=True)
+ acc_val = np.equal(
+ np.argmax(y_val, 1),
+ np.argmax(predict_y, 1)).astype(np.float32)
+ train_loss.append(loss_val[0])
+ train_acc.append(acc_val)
+ if auc_enabled:
+ train_auc.append(metrics.roc_auc_score(y_val, predict_y))
+ if auc_enabled:
+ return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
+ else:
+ return np.mean(train_loss), np.mean(train_acc)
+
+ def validate(iterations, tqdm_enabled=False):
+ localiter = tqdm(range(iterations)
+ ) if tqdm_enabled else range(iterations)
+ test_loss = []
+ test_acc = []
+ test_auc = []
+ for it in localiter:
+ loss_val, test_y_predicted, y_test_val = executor.run(
+ 'validate', convert_to_numpy_ret_vals=True)
+ correct_prediction = np.equal(
+ np.argmax(y_test_val, 1),
+ np.argmax(test_y_predicted, 1)).astype(np.float32)
+ test_loss.append(loss_val[0])
+ test_acc.append(correct_prediction)
+ test_auc.append(metrics.roc_auc_score(
+ y_test_val, test_y_predicted))
+ return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)
+
+ batch_size = 128
+
+ ctx = {
+ 'local': 'gpu:0',
+ 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7',
+ 'lhy': 'cpu:0,gpu:1,gpu:2,gpu:3,gpu:6',
+ 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3',
+ 'rhy': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3'
+ }[args.config]
+ dense_param_ctx = {'local': 'gpu:0', 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7', 'lhy': 'gpu:1,gpu:2,gpu:3,gpu:6',
+ 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3',
+ 'rhy': 'daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3'}[args.config]
+ with ht.context(ctx):
+ x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_data.load_adult_data()
+ dense_input = [
+ ht.dataloader_op([
+ [x_train_deep[:, i], batch_size, 'train'],
+ [x_test_deep[:, i], batch_size, 'validate'],
+ ]) for i in range(12)
+ ]
+ sparse_input = ht.dataloader_op([
+ [x_train_wide, batch_size, 'train'],
+ [x_test_wide, batch_size, 'validate'],
+ ])
+ y_ = ht.dataloader_op([
+ [y_train, batch_size, 'train'],
+ [y_test, batch_size, 'validate'],
+ ])
+ print("Data loaded.")
+
+ loss, prediction, y_, train_op = wdl_adult.wdl_adult(
+ dense_input, sparse_input, y_, dense_param_ctx)
+
+ eval_nodes = {'train': [loss, prediction, y_, train_op]}
+ if args.val:
+ print('Validation enabled...')
+ eval_nodes['validate'] = [loss, prediction, y_]
+ executor = ht.Executor(eval_nodes,
+ cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123)
+
+ total_epoch = args.nepoch if args.nepoch > 0 else 50
+ for ep in range(total_epoch):
+ if ep == 5:
+ start = time.time()
+ print("epoch %d" % ep)
+ ep_st = time.time()
+ train_loss, train_acc = train(
+ executor.get_batch_num('train'), auc_enabled=False)
+ ep_en = time.time()
+ if args.val:
+ val_loss, val_acc, val_auc = validate(
+ executor.get_batch_num('validate'))
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f"
+ % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc))
+ else:
+ print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
+ % (train_loss, train_acc, ep_en - ep_st))
+ print('all time:', time.time() - start)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--config', type=str, default='local',
+ help='[local, lps(localps), lhy(localhybrid), rps(remoteps), rhy]')
+ parser.add_argument("--val", action="store_true",
+ help="whether to use validation")
+ parser.add_argument("--all", action="store_true",
+ help="whether to use all data")
+ parser.add_argument("--bsp", action="store_true",
+ help="whether to use bsp instead of asp")
+ parser.add_argument("--cache", default=None, help="cache policy")
+ parser.add_argument("--bound", default=100, help="cache bound")
+ parser.add_argument("--nepoch", type=int, default=-1,
+ help="num of epochs, each train 1/10 data")
+ args = parser.parse_args()
+ worker(args)
diff --git a/hetu.exp b/hetu.exp
new file mode 100755
index 0000000..1528483
--- /dev/null
+++ b/hetu.exp
@@ -0,0 +1,5 @@
+#!/bin/bash
+path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+echo "Hetu root is" $path
+export PATH="$path/bin:$PATH"
+export PYTHONPATH="$path/python:$path/build/lib:$path/third_party/GraphMix/python:$PYTHONPATH:$path/third_party/HetuML/hetuml/python"
diff --git a/img/alibabacloud.png b/img/alibabacloud.png
new file mode 100644
index 0000000..2b0095e
Binary files /dev/null and b/img/alibabacloud.png differ
diff --git a/img/hetu.png b/img/hetu.png
new file mode 100644
index 0000000..5791951
Binary files /dev/null and b/img/hetu.png differ
diff --git a/img/kuaishou.png b/img/kuaishou.png
new file mode 100644
index 0000000..acce838
Binary files /dev/null and b/img/kuaishou.png differ
diff --git a/img/tencent.png b/img/tencent.png
new file mode 100644
index 0000000..26fdb92
Binary files /dev/null and b/img/tencent.png differ
diff --git a/ps-lite/.gitignore b/ps-lite/.gitignore
new file mode 100644
index 0000000..ba96dc2
--- /dev/null
+++ b/ps-lite/.gitignore
@@ -0,0 +1,2 @@
+src/meta.pb.cc
+src/meta.pb.h
diff --git a/ps-lite/CMakeLists.txt b/ps-lite/CMakeLists.txt
new file mode 100644
index 0000000..4d6fce6
--- /dev/null
+++ b/ps-lite/CMakeLists.txt
@@ -0,0 +1,41 @@
+# port from https://github.com/dmlc/ps-lite/blob/master/CMakeLists.txt
+
+aux_source_directory(src PS_SRC)
+add_library(ps SHARED ${PS_SRC})
+target_include_directories(ps PUBLIC include)
+
+# find and build zeroMQ
+find_package(ZMQ 4.3.2)
+if(NOT ZMQ_FOUND)
+ message(STATUS "ZMQ not Found, Preparing ZMQ ...")
+ FetchContent_Declare(zmq URL https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip)
+ FetchContent_GetProperties(zmq)
+ if(NOT zmq_POPULATED)
+ FetchContent_Populate(zmq)
+ if(POLICY CMP0077) # Avoid building shared library and tests on CMake 3.13+
+ cmake_policy(SET CMP0077 NEW)
+ set(BUILD_SHARED OFF CACHE BOOL "")
+ set(BUILD_TESTS OFF CACHE BOOL "")
+ endif()
+ add_subdirectory(${zmq_SOURCE_DIR} ${zmq_BINARY_DIR})
+ endif()
+ target_link_libraries(ps PRIVATE libzmq-static)
+else()
+ target_include_directories(ps PRIVATE ${ZMQ_INCLUDE_DIRS})
+ target_link_libraries(ps PRIVATE ${ZMQ_LIBRARIES})
+endif()
+
+# find and build protobuf
+find_package(Protobuf)
+if(NOT Protobuf_FOUND)
+ message(FATAL_ERROR "Protobuf not found, use `conda install protobuf`")
+endif()
+
+set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE)
+file(GLOB_RECURSE PROTO_FILES "proto/*.proto")
+protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES})
+target_sources(ps PRIVATE ${PROTO_SRCS})
+target_include_directories(ps PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # PROTO_HDRS files are here
+target_include_directories(ps PRIVATE ${PROTOBUF_INCLUDE_DIR})
+target_include_directories(ps PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_link_libraries(ps PRIVATE ${PROTOBUF_LIBRARY})
diff --git a/ps-lite/README.md b/ps-lite/README.md
new file mode 100644
index 0000000..4587c0f
--- /dev/null
+++ b/ps-lite/README.md
@@ -0,0 +1,83 @@
+# PS-lite Module [adapted from https://github.com/dmlc/ps-lite]
+
+---
+
+## Use Guide
+
+PS-lite module is a light-weight C++ parameter server with a ctypes Python interface. It provides a list of PS functions that are useful in distributed training.
+
+To use PS, we will have three roles: worker, server and scheduler. Workers are the training processes. Servers are where the parameters are stored. The scheduler sets up and tears down the connections. There can be multiple servers and workers but only one scheduler.
+
+Currently, we have only implemented a Python interface for hetu. Since it contains some symbols from hetu, we can only use PS functions after we import hetu. Here is a quick example of how we use ps-lite with hetu.
+
+```python
+# worker.py
+import hetu
+import numpy as np
+import ctypes
+# create arrays
+tgt_array = hetu.ndarray.empty([128])
+name = 0 # A number specifies a parameter, should be the same among all workers
+param_type = 0 # 0 for dense parameter
+# PS initialize
+hetu.worker_init()
+# PS functions here
+comm = hetu.get_worker_communicate()
+# InitTensor(node_name, param_type, length, width, init_type, init_param_a, init_param_b, seed, opt_type, opt_args, num_opt_args)
+# This function is synchronous.
+comm.InitTensor(name, param_type, 128, 1, 0, 5.0, 1.0, 123, 0, (ctypes.c_float * 1)(0.1), 1)
+comm.Pull(name, tgt_array.handle)
+comm.Wait(name)
+print(tgt_array.asnumpy())
+# PS finalize
+hetu.worker_finish()
+```
+We will also have server code and scheduler code
+```python
+# server.py
+import hetu
+hetu.server_init()
+hetu.server_finish()
+```
+
+```python
+# scheduler.py
+import hetu
+hetu.scheduler_init()
+hetu.scheduler_finish()
+```
+
+To run the scripts, we should use environment variables to specify which IP address and port to use. Note that it is recommended to use a yaml or json file to store these environment variables.
+
+```shell
+export DMLC_PS_ROOT_URI=127.0.0.1 DMLC_PS_ROOT_PORT=4080 DMLC_NUM_WORKER=1 DMLC_NUM_SERVER=1 DMLC_PS_VAN_TYPE=p3
+DMLC_ROLE=scheduler python3 scheduler.py &
+DMLC_ROLE=server SERVER_ID=0 DMLC_PS_SERVER_URI=127.0.0.1 DMLC_PS_SERVER_PORT=4081 python3 server.py &
+DMLC_ROLE=worker WORKER_ID=0 DMLC_PS_WORKER_URI=127.0.0.1 DMLC_PS_WORKER_PORT=4082 python3 worker.py
+```
+
+## PS functions
+
+We provide a list of useful parameter server functions for training.
+
+It is also easy to extend with new PS functions. There are several steps to follow.
+
+1. Create an enum in psf/PSFunc.h and write a struct to define the PS function.
+
+ ```C++
+ template<> struct PSFData {
+ using Request = tuple<
+ unsigned long,
+ SArray
+ >;
+ using Response = tuple<>;
+ static void _callback(const Response &response) {/* callback here */}
+ };
+ ```
+
+ Here we can use scalar types like int, float... or arrays as function parameters. Note that arrays are shared and scalars are copied.
+
+2. Implement server handler in server/PSFHandler.h
+
+3. Use kvworker.Request to launch your PS function and kvworker.Wait to wait until the callback ends; see more examples in PSAgent.h. We can also write a Python binding to expose the PS function to the Python layer.
+
diff --git a/ps-lite/include/common/dmlc_base.h b/ps-lite/include/common/dmlc_base.h
new file mode 100644
index 0000000..f33ba10
--- /dev/null
+++ b/ps-lite/include/common/dmlc_base.h
@@ -0,0 +1,195 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file base.h
+ * \brief defines configuration macros
+ */
+#ifndef DMLC_BASE_H_
+#define DMLC_BASE_H_
+
+/*! \brief whether use glog for logging */
+#ifndef DMLC_USE_GLOG
+#define DMLC_USE_GLOG 0
+#endif
+
+/*!
+ * \brief whether throw dmlc::Error instead of
+ * directly calling abort when FATAL error occured
+ * NOTE: this may still not be perfect.
+ * do not use FATAL and CHECK in destructors
+ */
+#ifndef DMLC_LOG_FATAL_THROW
+#define DMLC_LOG_FATAL_THROW 1
+#endif
+
+/*!
+ * \brief Whether to print stack trace for fatal error,
+ * enabled on linux when using gcc.
+ */
+#if (defined(__GNUC__) && !defined(__MINGW32__) && !defined(__sun) \
+ && !defined(__SVR4) && !(defined __MINGW64__) && !(defined __ANDROID__))
+#if (!defined(DMLC_LOG_STACK_TRACE))
+#define DMLC_LOG_STACK_TRACE 1
+#endif
+#if (!defined(DMLC_LOG_STACK_TRACE_SIZE))
+#define DMLC_LOG_STACK_TRACE_SIZE 10
+#endif
+#endif
+
+/*! \brief whether compile with hdfs support */
+#ifndef DMLC_USE_HDFS
+#define DMLC_USE_HDFS 0
+#endif
+
+/*! \brief whether compile with s3 support */
+#ifndef DMLC_USE_S3
+#define DMLC_USE_S3 0
+#endif
+
+/*! \brief whether or not use parameter server */
+#ifndef DMLC_USE_PS
+#define DMLC_USE_PS 0
+#endif
+
+/*! \brief whether or not use c++11 support */
+#ifndef DMLC_USE_CXX11
+#define DMLC_USE_CXX11 \
+ (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L \
+ || defined(_MSC_VER))
+#endif
+
+/// check if g++ is before 4.6
+#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 6
+#pragma message("Will need g++-4.6 or higher to compile all" \
+ "the features in dmlc-core, " \
+ "compile without c++11, some features may be disabled")
+#undef DMLC_USE_CXX11
+#define DMLC_USE_CXX11 0
+#endif
+#endif
+
+/*!
+ * \brief Disable copy constructor and assignment operator.
+ *
+ * If C++11 is supported, both copy and move constructors and
+ * assignment operators are deleted explicitly. Otherwise, they are
+ * only declared but not implemented. Place this macro in private
+ * section if C++11 is not available.
+ */
+#ifndef DISALLOW_COPY_AND_ASSIGN
+#if DMLC_USE_CXX11
+#define DISALLOW_COPY_AND_ASSIGN(T) \
+ T(T const &) = delete; \
+ T(T &&) = delete; \
+ T &operator=(T const &) = delete; \
+ T &operator=(T &&) = delete
+#else
+#define DISALLOW_COPY_AND_ASSIGN(T) \
+ T(T const &); \
+ T &operator=(T const &)
+#endif
+#endif
+
+///
+/// code block to handle optionally loading
+///
+#if !defined(__GNUC__)
+#define fopen64 std::fopen
+#endif
+#ifdef _MSC_VER
+#if _MSC_VER < 1900
+// NOTE: sprintf_s is not equivalent to snprintf,
+// they are equivalent when success, which is sufficient for our case
+#define snprintf sprintf_s
+#define vsnprintf vsprintf_s
+#endif
+#else
+#ifdef _FILE_OFFSET_BITS
+#if _FILE_OFFSET_BITS == 32
+#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
+#endif
+#endif
+
+#ifdef __APPLE__
+#define off64_t off_t
+#define fopen64 std::fopen
+#endif
+
+extern "C" {
+#include
+}
+#endif
+
+#ifdef _MSC_VER
+//! \cond Doxygen_Suppress
+typedef signed char int8_t;
+typedef __int16 int16_t;
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+//! \endcond
+#else
+#include
+#endif
+#include
+#include
+
+/*! \brief namespace for dmlc */
+namespace dmlc {
+/*!
+ * \brief safely get the beginning address of a vector
+ * \param vec input vector
+ * \return beginning address of a vector
+ */
+template
+inline T *BeginPtr(std::vector &vec) { // NOLINT(*)
+ if (vec.size() == 0) {
+ return NULL;
+ } else {
+ return &vec[0];
+ }
+}
+/*!
+ * \brief get the beginning address of a vector
+ * \param vec input vector
+ * \return beginning address of a vector
+ */
+template
+inline const T *BeginPtr(const std::vector &vec) {
+ if (vec.size() == 0) {
+ return NULL;
+ } else {
+ return &vec[0];
+ }
+}
+/*!
+ * \brief get the beginning address of a vector
+ * \param str input string
+ * \return beginning address of a string
+ */
+inline char *BeginPtr(std::string &str) { // NOLINT(*)
+ if (str.length() == 0)
+ return NULL;
+ return &str[0];
+}
+/*!
+ * \brief get the beginning address of a vector
+ * \param str input string
+ * \return beginning address of a string
+ */
+inline const char *BeginPtr(const std::string &str) {
+ if (str.length() == 0)
+ return NULL;
+ return &str[0];
+}
+} // namespace dmlc
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define constexpr const
+#define alignof __alignof
+#endif
+
+#endif // DMLC_BASE_H_
diff --git a/ps-lite/include/common/logging.h b/ps-lite/include/common/logging.h
new file mode 100644
index 0000000..0aa23bc
--- /dev/null
+++ b/ps-lite/include/common/logging.h
@@ -0,0 +1,349 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file logging.h
+ * \brief defines logging macros of dmlc
+ * allows use of GLOG, fall back to internal
+ * implementation when disabled
+ */
+#ifndef DMLC_LOGGING_H_
+#define DMLC_LOGGING_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include "dmlc_base.h"
+
+#if DMLC_LOG_STACK_TRACE
+#include
+#endif
+
+#if DMLC_LOG_STACK_TRACE
+#include
+#endif
+
+namespace dmlc {
+/*!
+ * \brief exception class that will be thrown by
+ * default logger if DMLC_LOG_FATAL_THROW == 1
+ */
+struct Error : public std::runtime_error {
+ /*!
+ * \brief constructor
+ * \param s the error message
+ */
+ explicit Error(const std::string &s) : std::runtime_error(s) {
+ }
+};
+} // namespace dmlc
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define noexcept(a)
+#endif
+
+#if DMLC_USE_CXX11
+#define DMLC_THROW_EXCEPTION noexcept(false)
+#else
+#define DMLC_THROW_EXCEPTION
+#endif
+
+#if DMLC_USE_GLOG
+#include
+
+namespace dmlc {
+inline void InitLogging(const char *argv0) {
+ google::InitGoogleLogging(argv0);
+}
+} // namespace dmlc
+
+#else
+// use a light version of glog
+#include
+#include
+#include
+#include
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4722)
+#endif
+
+namespace dmlc {
+inline void InitLogging(const char *argv0) {
+ // DO NOTHING
+}
+
+// Always-on checking
+#define CHECK(x) \
+ if (!(x)) \
+ dmlc::LogMessageFatal(__FILE__, __LINE__).stream() << "Check " \
+ "failed: " #x \
+ << ' '
+#define CHECK_LT(x, y) CHECK((x) < (y))
+#define CHECK_GT(x, y) CHECK((x) > (y))
+#define CHECK_LE(x, y) CHECK((x) <= (y))
+#define CHECK_GE(x, y) CHECK((x) >= (y))
+#define CHECK_EQ(x, y) CHECK((x) == (y))
+#define CHECK_NE(x, y) CHECK((x) != (y))
+#define CHECK_NOTNULL(x) \
+ ((x) == NULL ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \
+ << "Check notnull: " #x << ' ', \
+ (x) : (x)) // NOLINT(*)
+// Debug-only checking.
+#ifdef NDEBUG
+#define DCHECK(x) \
+ while (false) \
+ CHECK(x)
+#define DCHECK_LT(x, y) \
+ while (false) \
+ CHECK((x) < (y))
+#define DCHECK_GT(x, y) \
+ while (false) \
+ CHECK((x) > (y))
+#define DCHECK_LE(x, y) \
+ while (false) \
+ CHECK((x) <= (y))
+#define DCHECK_GE(x, y) \
+ while (false) \
+ CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) \
+ while (false) \
+ CHECK((x) == (y))
+#define DCHECK_NE(x, y) \
+ while (false) \
+ CHECK((x) != (y))
+#else
+#define DCHECK(x) CHECK(x)
+#define DCHECK_LT(x, y) CHECK((x) < (y))
+#define DCHECK_GT(x, y) CHECK((x) > (y))
+#define DCHECK_LE(x, y) CHECK((x) <= (y))
+#define DCHECK_GE(x, y) CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) CHECK((x) == (y))
+#define DCHECK_NE(x, y) CHECK((x) != (y))
+#endif // NDEBUG
+
+#define LOG_INFO dmlc::LogMessage(__FILE__, __LINE__)
+#define LOG_ERROR LOG_INFO
+#define LOG_WARNING LOG_INFO
+#define LOG_FATAL dmlc::LogMessageFatal(__FILE__, __LINE__)
+#define LOG_QFATAL LOG_FATAL
+
+// Poor man version of VLOG
+#define VLOG(x) LOG_INFO.stream()
+
+#define LOG(severity) LOG_##severity.stream()
+#define LG LOG_INFO.stream()
+#define LF LOG_FATAL.stream()
+#define LGMT dmlc::LogMessageMultiThread(__FILE__, __LINE__).stream()
+#define LOG_IF(severity, condition) \
+ !(condition) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity)
+
+#ifdef NDEBUG
+#define LOG_DFATAL LOG_ERROR
+#define DFATAL ERROR
+#define DLOG(severity) \
+ true ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity)
+#define DLOG_IF(severity, condition) \
+ (true || !(condition)) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity)
+#else
+#define LOG_DFATAL LOG_FATAL
+#define DFATAL FATAL
+#define DLOG(severity) LOG(severity)
+#define DLOG_IF(severity, condition) LOG_IF(severity, condition)
+#endif
+
+// Poor man version of LOG_EVERY_N
+#define LOG_EVERY_N(severity, n) LOG(severity)
+
+class DateLogger {
+public:
+ DateLogger() {
+#if defined(_MSC_VER)
+ _tzset();
+#endif
+ }
+ const char *HumanDate() {
+#if defined(_MSC_VER)
+ _strtime_s(buffer_, sizeof(buffer_));
+#else
+ time_t time_value = time(NULL);
+ struct tm now;
+ localtime_r(&time_value, &now);
+ snprintf(buffer_, sizeof(buffer_), "%02d:%02d:%02d", now.tm_hour,
+ now.tm_min, now.tm_sec);
+#endif
+ return buffer_;
+ }
+
+private:
+ char buffer_[9];
+};
+
+class LogMessage {
+public:
+ LogMessage(const char *file, int line) :
+#ifdef __ANDROID__
+ log_stream_(std::cout)
+#else
+ log_stream_(std::cerr)
+#endif
+ {
+ log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":"
+ << line << ": ";
+ }
+ ~LogMessage() {
+ log_stream_ << "\n";
+ }
+ std::ostream &stream() {
+ return log_stream_;
+ }
+
+protected:
+ std::ostream &log_stream_;
+
+private:
+ DateLogger pretty_date_;
+ LogMessage(const LogMessage &);
+ void operator=(const LogMessage &);
+};
+
+#if DMLC_LOG_STACK_TRACE
+inline std::string Demangle(char const *msg_str) {
+ using std::string;
+ string msg(msg_str);
+ size_t symbol_start = string::npos;
+ size_t symbol_end = string::npos;
+ if (((symbol_start = msg.find("_Z")) != string::npos)
+ && (symbol_end = msg.find_first_of(" +", symbol_start))) {
+ string left_of_symbol(msg, 0, symbol_start);
+ string symbol(msg, symbol_start, symbol_end - symbol_start);
+ string right_of_symbol(msg, symbol_end);
+
+ int status = 0;
+ size_t length = string::npos;
+ std::unique_ptr demangled_symbol = {
+ abi::__cxa_demangle(symbol.c_str(), 0, &length, &status),
+ &std::free};
+ if (demangled_symbol && status == 0 && length > 0) {
+ string symbol_str(demangled_symbol.get());
+ std::ostringstream os;
+ os << left_of_symbol << symbol_str << right_of_symbol;
+ return os.str();
+ }
+ }
+ return string(msg_str);
+}
+
+inline std::string StackTrace() {
+ using std::string;
+ std::ostringstream stacktrace_os;
+ const int MAX_STACK_SIZE = DMLC_LOG_STACK_TRACE_SIZE;
+ void *stack[MAX_STACK_SIZE];
+ int nframes = backtrace(stack, MAX_STACK_SIZE);
+ stacktrace_os << "Stack trace returned " << nframes
+ << " entries:" << std::endl;
+ char **msgs = backtrace_symbols(stack, nframes);
+ if (msgs != nullptr) {
+ for (int frameno = 0; frameno < nframes; ++frameno) {
+ string msg = dmlc::Demangle(msgs[frameno]);
+ stacktrace_os << "[bt] (" << frameno << ") " << msg << "\n";
+ }
+ }
+ free(msgs);
+ string stack_trace = stacktrace_os.str();
+ return stack_trace;
+}
+
+#else // DMLC_LOG_STACK_TRACE is off
+
+inline std::string demangle(char const *msg_str) {
+ return std::string();
+}
+
+inline std::string StackTrace() {
+ return std::string("stack traces not available when "
+ "DMLC_LOG_STACK_TRACE is disabled at compile time.");
+}
+
+#endif // DMLC_LOG_STACK_TRACE
+
+#if DMLC_LOG_FATAL_THROW == 0
+class LogMessageFatal : public LogMessage {
+public:
+ LogMessageFatal(const char *file, int line) : LogMessage(file, line) {
+ }
+ ~LogMessageFatal() {
+ log_stream_ << "\n";
+ abort();
+ }
+
+private:
+ LogMessageFatal(const LogMessageFatal &);
+ void operator=(const LogMessageFatal &);
+};
+#else
+class LogMessageFatal {
+public:
+ LogMessageFatal(const char *file, int line) {
+ log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":"
+ << line << ": ";
+ }
+ std::ostringstream &stream() {
+ return log_stream_;
+ }
+ ~LogMessageFatal() DMLC_THROW_EXCEPTION {
+#if DMLC_LOG_STACK_TRACE
+ log_stream_ << "\n\n" << StackTrace() << "\n";
+#endif
+ // throwing out of destructor is evil
+ // hopefully we can do it here
+ // also log the message before throw
+ LOG(ERROR) << log_stream_.str();
+ throw Error(log_stream_.str());
+ }
+
+private:
+ std::ostringstream log_stream_;
+ DateLogger pretty_date_;
+ LogMessageFatal(const LogMessageFatal &);
+ void operator=(const LogMessageFatal &);
+};
+#endif
+
+// This class is used to explicitly ignore values in the conditional
+// logging macros. This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class LogMessageVoidify {
+public:
+ LogMessageVoidify() {
+ }
+ // This has to be an operator with a precedence lower than << but
+ // higher than "?:". See its usage.
+ void operator&(std::ostream &) {
+ }
+};
+
+class LogMessageMultiThread {
+public:
+ LogMessageMultiThread(const char *file, int line) {
+ log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":"
+ << line << ": ";
+ }
+ std::ostringstream &stream() {
+ return log_stream_;
+ }
+ ~LogMessageMultiThread() {
+ log_stream_ << "\n";
+ }
+
+private:
+ std::ostringstream log_stream_;
+ DateLogger pretty_date_;
+ LogMessageMultiThread(const LogMessageMultiThread &);
+ void operator=(const LogMessageMultiThread &);
+};
+
+} // namespace dmlc
+
+#endif
+#endif // DMLC_LOGGING_H_
diff --git a/ps-lite/include/common/sarray.h b/ps-lite/include/common/sarray.h
new file mode 100644
index 0000000..71c7cea
--- /dev/null
+++ b/ps-lite/include/common/sarray.h
@@ -0,0 +1,328 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "logging.h"
+
+/**
+ * \brief Shared array
+ *
+ * A smart array that retains shared ownership. It provides similar
+ * functionalities comparing to std::vector, including data(), size(),
+ * operator[], resize(), clear(). SArray can be easily constructed from
+ * std::vector, such as
+ *
+ * \code
+ * std::vector<int> a(10); SArray<int> b(a); // copying
+ * std::shared_ptr<std::vector<int>> c(new std::vector<int>(10));
+ * SArray<int> d(c); // only pointer copying
+ * \endcode
+ *
+ * SArray is also like a C pointer when copying and assigning, namely
+ * both copy and assign pass by pointer. The memory will be released only
+ * when no copy exists. It can also be cast without memory copy, such
+ * as
+ *
+ * \code
+ * SArray<int> a(10);
+ * SArray<char> b(a); // now b.size() = 10 * sizeof(int);
+ * \endcode
+ *
+ * \tparam V the value type
+ */
+template
+class SArray {
+public:
+ /** \brief empty constructor */
+ SArray() {
+ }
+
+ /** \brief empty destructor */
+ ~SArray() {
+ }
+
+ /**
+ * \brief Create an array with length n with initialized value
+ * \param size the length
+ * \param val the initial value (0 by default)
+ */
+ explicit SArray(size_t size, V val = 0) {
+ resize(size, val);
+ }
+
+ /**
+ * \brief construct from another SArray.
+ *
+ * Zero-copy constructor, namely just copy the pointer
+ *
+ * \tparam W the value type of the source array
+ * \param arr the source array
+ */
+ template
+ explicit SArray(const SArray &arr) {
+ *this = arr;
+ }
+
+ /**
+ * \brief assign from another SArray.
+ *
+ * Zero-copy assignment, namely just copy the pointer. The element
+ * count is rescaled by sizeof(W)/sizeof(V); the byte sizes must match
+ * exactly (checked below), otherwise this aborts via CHECK_EQ.
+ *
+ * \tparam W the value type of the source array
+ * \param arr the source array
+ */
+ template
+ void operator=(const SArray &arr) {
+ size_ = arr.size() * sizeof(W) / sizeof(V);
+ CHECK_EQ(size_ * sizeof(V), arr.size() * sizeof(W))
+ << "cannot be divided";
+ capacity_ = arr.capacity() * sizeof(W) / sizeof(V);
+ // shared_ptr aliasing constructor: share ownership with arr's
+ // control block while pointing at its buffer reinterpreted as V*
+ ptr_ = std::shared_ptr(arr.ptr(), reinterpret_cast(arr.data()));
+ }
+
+ /**
+ * \brief construct from a c-array
+ *
+ * Zero-copy constructor, namely just copy the pointer
+ *
+ * \param data the source data
+ * \param size the length
+ * \param deletable whether or not can call `delete [] data` when the
+ * reference count goes 0
+ */
+
+ SArray(V *data, size_t size, bool deletable = false) {
+ if (deletable) {
+ reset(data, size, [](V *data) { delete[] data; });
+ } else {
+ // non-owning view: deleter is a no-op, caller keeps ownership
+ reset(data, size, [](V *data) {});
+ }
+ }
+
+ /**
+ * \brief copy from a c-array
+ *
+ * NOTE(review): memcpy assumes V is trivially copyable.
+ *
+ * \param data the source data
+ * \param size the length
+ */
+ void CopyFrom(const V *data, size_t size) {
+ resize(size);
+ memcpy(this->data(), data, size * sizeof(V));
+ }
+
+ /**
+ * \brief copy from another SArray
+ *
+ * \param other the source data
+ */
+ void CopyFrom(const SArray &other) {
+ // guard against self-copy, which would resize and then memcpy over
+ // the same buffer
+ if (this == &other)
+ return;
+ CopyFrom(other.data(), other.size());
+ }
+
+ /**
+ * \brief copy from an iterator range [first, last) into freshly
+ * allocated storage
+ */
+ template
+ void CopyFrom(const ForwardIt &first, const ForwardIt &last) {
+ size_t size = static_cast(std::distance(first, last));
+ V *data = new V[size];
+ // take ownership first; the local `data` cursor is then advanced,
+ // ptr_ keeps the original base pointer
+ reset(data, size, [](V *data) { delete[] data; });
+ auto it = first;
+ while (size-- > 0) {
+ *data = *it;
+ ++data;
+ ++it;
+ }
+ }
+
+ /**
+ * \brief construct from a std::vector, copy the data
+ */
+ explicit SArray(const std::vector &vec) {
+ CopyFrom(vec.data(), vec.size());
+ }
+
+ /**
+ * \brief construct from a shared std::vector pointer, no data copy
+ * (aliasing shared_ptr keeps the vector alive)
+ */
+ explicit SArray(const std::shared_ptr> &vec) {
+ ptr_ = std::shared_ptr(vec, vec->data());
+ size_ = vec->size();
+ capacity_ = size_;
+ }
+
+ /** @brief Copy from a initializer_list */
+ template
+ SArray(const std::initializer_list &list) {
+ CopyFrom(list.begin(), list.end());
+ }
+
+ /** @brief Copy from a initializer_list */
+ template
+ void operator=(const std::initializer_list &list) {
+ CopyFrom(list.begin(), list.end());
+ }
+
+ /**
+ * @brief Reset the current data pointer with a deleter
+ */
+ template
+ void reset(V *data, size_t size, Deleter del) {
+ size_ = size;
+ capacity_ = size;
+ ptr_.reset(data, del);
+ }
+
+ /**
+ * @brief Resizes the array to size elements
+ *
+ * If size <= capacity_, then only change the size. otherwise, append size -
+ * current_size entries, and then set new value to val
+ *
+ * NOTE(review): memcpy/memset below assume V is trivially copyable;
+ * on growth `size + 5` elements are allocated but reset() records
+ * capacity_ = size, so the 5 spare slots are not accounted for.
+ */
+ void resize(size_t size, V val = 0) {
+ size_t cur_n = size_;
+ if (capacity_ >= size) {
+ size_ = size;
+ } else {
+ V *new_data = new V[size + 5];
+ memcpy(new_data, data(), size_ * sizeof(V));
+ reset(new_data, size, [](V *data) { delete[] data; });
+ }
+ // fill only the newly exposed tail [cur_n, size)
+ if (size <= cur_n)
+ return;
+ V *p = data() + cur_n;
+ if (val == 0) {
+ memset(p, 0, (size - cur_n) * sizeof(V));
+ } else {
+ for (size_t i = 0; i < size - cur_n; ++i) {
+ *p = val;
+ ++p;
+ }
+ }
+ }
+
+ /**
+ * @brief Requests that the capacity be at least enough to contain n
+ * elements.
+ */
+ void reserve(size_t size) {
+ if (capacity_ >= size) {
+ return;
+ }
+ // grow via resize(), then restore the logical size
+ size_t old_size = size_;
+ resize(size);
+ size_ = old_size;
+ }
+
+ /** @brief release the memory */
+ void clear() {
+ reset(nullptr, 0, [](V *data) {});
+ }
+
+ inline bool empty() const {
+ return size() == 0;
+ }
+ inline size_t size() const {
+ return size_;
+ }
+ inline size_t capacity() const {
+ return capacity_;
+ }
+
+ inline V *begin() {
+ return data();
+ }
+ inline const V *begin() const {
+ return data();
+ }
+ inline V *end() {
+ return data() + size();
+ }
+ inline const V *end() const {
+ return data() + size();
+ }
+
+ inline V *data() const {
+ return ptr_.get();
+ }
+
+ /** \brief get the shared pointer */
+ inline std::shared_ptr &ptr() {
+ return ptr_;
+ }
+ /** \brief get the const shared pointer */
+ inline const std::shared_ptr &ptr() const {
+ return ptr_;
+ }
+
+ inline V back() const {
+ CHECK(!empty());
+ return data()[size_ - 1];
+ }
+ inline V front() const {
+ CHECK(!empty());
+ return data()[0];
+ }
+ inline V &operator[](size_t i) {
+ return data()[i];
+ }
+ inline const V &operator[](size_t i) const {
+ return data()[i];
+ }
+
+ // amortized growth: double plus a small constant when full
+ inline void push_back(const V &val) {
+ if (size_ == capacity_)
+ reserve(size_ * 2 + 5);
+ data()[size_++] = val;
+ }
+
+ // shrinks the logical size only; memory is kept
+ void pop_back() {
+ if (size_)
+ --size_;
+ }
+
+ // append a copy of arr's elements (memcpy; assumes trivially
+ // copyable V)
+ void append(const SArray &arr) {
+ if (arr.empty())
+ return;
+ auto orig_size = size_;
+ resize(size_ + arr.size());
+ memcpy(data() + orig_size, arr.data(), arr.size() * sizeof(V));
+ }
+
+ /**
+ * @brief Slice a segment, zero-copy
+ *
+ * The result aliases this array's buffer (shared ownership), so
+ * writes through the segment are visible here.
+ *
+ * @param begin the start index segment
+ * @param end the end index segment
+ * @return the segment [begin, end)
+ */
+ SArray segment(size_t begin, size_t end) const {
+ CHECK_GE(end, begin);
+ CHECK_LE(end, size());
+ SArray ret;
+ ret.ptr_ = std::shared_ptr(ptr_, data() + begin);
+ ret.size_ = end - begin;
+ ret.capacity_ = end - begin;
+ return ret;
+ }
+
+private:
+ size_t size_ = 0;
+ size_t capacity_ = 0;
+ std::shared_ptr ptr_;
+};
+
+/**
+ * \brief print a debug string
+ */
+template
+std::ostream &operator<<(std::ostream &os, const SArray &obj) {
+ // DebugStr is a project helper declared elsewhere; the exact format
+ // is defined there
+ os << DebugStr(obj.data(), obj.size());
+ return os;
+}
diff --git a/ps-lite/include/common/shared_mutex.h b/ps-lite/include/common/shared_mutex.h
new file mode 100644
index 0000000..d7ea395
--- /dev/null
+++ b/ps-lite/include/common/shared_mutex.h
@@ -0,0 +1,184 @@
+/*
+ * shared_mutex (C) 2017 E. Oriani, ema fastwebnet it
+ *
+ * This file is part of shared_mutex.
+ *
+ * shared_mutex is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * shared_mutex is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with nettop. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+// this should be defined in the Makefile
+// if not defined, use what is most common
+// for x86_64 CPUs in 2017...
+#ifndef LEVEL1_DCACHE_LINESIZE
+#define LEVEL1_DCACHE_LINESIZE 64
+#endif
+
+namespace ps {
+// Bucketed reader/writer mutex: readers touch only their own bucket
+// (reducing cache-line contention, each entry_lock is cache-line
+// aligned), while a writer must claim W_MASK in every bucket.
+template
+class shared_mutex {
+ // purpose of this structure is to hold
+ // status of each individual bucket-mutex
+ // object
+ // Ideally each thread should be mapped to
+ // one entry only of 'el_' during its
+ // lifetime
+ struct entry_lock {
+ const static uint64_t W_MASK = 0x8000000000000000, R_MASK = ~W_MASK;
+
+ // purpose of this variable is to hold
+ // in the first bit (W_MASK) if we're locking
+ // in exclusive mode, otherwise use the
+ // remaining 63 bits to count how many R/O
+ // locks we share in this very bucket
+ std::atomic wr_lock;
+
+ entry_lock() : wr_lock(0) {
+ }
+ } __attribute__((aligned(LEVEL1_DCACHE_LINESIZE)));
+ // array holding all the buckets
+ std::array el_;
+ // atomic variable used to initialize thread
+ // ids so that they should evenly spread
+ // across all the buckets
+ static std::atomic idx_hint_;
+ // lock-free function to return a 'unique' id
+ // (CAS loop: retries until the increment wins)
+ static uint64_t get_hint_idx(void) {
+ while (true) {
+ size_t cur_hint = idx_hint_.load();
+ if (idx_hint_.compare_exchange_weak(cur_hint, cur_hint + 1))
+ return cur_hint;
+ }
+ }
+ // get index for given thread
+ // could have used something like
+ // std::hash()(std::this_thread::get_id()) but honestly
+ // using a controlled idx_hint_ seems to be better in terms of putting
+ // threads into buckets evenly note - thread_local is supposed to be
+ // static...
+ // (the bucket is computed once per thread and cached in `rv`)
+ inline static size_t get_thread_idx(void) {
+ const thread_local size_t rv = get_hint_idx() % N;
+ return rv;
+ }
+
+public:
+ shared_mutex() {
+ }
+
+ // Shared (reader) lock: increment the R/O count of this thread's
+ // bucket once no writer holds W_MASK there.
+ void lock_shared(void) {
+ // try to replace the wr_lock with current value incremented by one
+ while (true) {
+ size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load();
+ if (entry_lock::W_MASK & cur_rw_lock) {
+ // if someone has got W access yield and retry...
+ std::this_thread::yield();
+ continue;
+ }
+ if (el_[get_thread_idx()].wr_lock.compare_exchange_weak(
+ cur_rw_lock, cur_rw_lock + 1))
+ break;
+ }
+ }
+
+ // Release a shared lock: decrement this thread's bucket counter.
+ void unlock_shared(void) {
+ // try to decrement the count
+ while (true) {
+ size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load();
+#ifndef _RELEASE
+ if (entry_lock::W_MASK & cur_rw_lock)
+ throw std::runtime_error(
+ "Fatal: unlock_shared but apparently this entry is W_MASK locked!");
+#endif //_RELEASE
+ if (el_[get_thread_idx()].wr_lock.compare_exchange_weak(
+ cur_rw_lock, cur_rw_lock - 1))
+ break;
+ }
+ }
+
+ // Exclusive (writer) lock: claim W_MASK in every bucket, in index
+ // order, each only once its reader count drops to 0.
+ void lock(void) {
+ for (size_t i = 0; i < N; ++i) {
+ // acquire all locks from all buckets
+ while (true) {
+ size_t cur_rw_lock = el_[i].wr_lock.load();
+ if (cur_rw_lock != 0) {
+ std::this_thread::yield();
+ continue;
+ }
+ // if cur_rw_lock is 0 then proceed
+ if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock,
+ entry_lock::W_MASK))
+ break;
+ }
+ }
+ }
+
+ // Release the exclusive lock: reset every bucket back to 0.
+ void unlock(void) {
+ for (size_t i = 0; i < N; ++i) {
+ // release all locks
+ while (true) {
+ size_t cur_rw_lock = el_[i].wr_lock.load();
+#ifndef _RELEASE
+ if (cur_rw_lock != entry_lock::W_MASK)
+ throw std::runtime_error(
+ "Fatal: unlock but apparently this entry is shared locked or uninitialized!");
+#endif //_RELEASE
+ // then proceed resetting to 0
+ if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock, 0))
+ break;
+ }
+ }
+ }
+
+ ~shared_mutex() {
+ }
+};
+
+// out-of-class definition of the static bucket-assignment counter
+template
+std::atomic shared_mutex::idx_hint_{0};
+
+// utility class for exclusive RAII lock
+template
+class x_lock {
+ shared_mutex &sm_;
+
+public:
+ // acquire the exclusive (writer) lock for this object's lifetime
+ x_lock(shared_mutex &sm) : sm_(sm) {
+ sm_.lock();
+ }
+
+ ~x_lock() {
+ sm_.unlock();
+ }
+};
+
+// utility class for share RAII lock
+template
+class s_lock {
+ shared_mutex &sm_;
+
+public:
+ // acquire the shared (reader) lock for this object's lifetime
+ s_lock(shared_mutex &sm) : sm_(sm) {
+ sm_.lock_shared();
+ }
+
+ ~s_lock() {
+ sm_.unlock_shared();
+ }
+};
+} // namespace ps
diff --git a/ps-lite/include/common/thread_pool.h b/ps-lite/include/common/thread_pool.h
new file mode 100644
index 0000000..1899b0a
--- /dev/null
+++ b/ps-lite/include/common/thread_pool.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Pool of worker threads consuming a FIFO queue of std::function tasks.
+// Enqueue() wraps any callable in a packaged_task and returns a future
+// for its result; constructor/destructor/Get/Wait are defined in a .cc.
+class ThreadPool {
+public:
+ ThreadPool(size_t thread_num);
+ ~ThreadPool();
+ // presumably a process-wide singleton accessor — defined elsewhere
+ static ThreadPool *Get();
+
+ // Submit a callable with its arguments; throws std::runtime_error if
+ // the pool has already been terminated.
+ template
+ auto Enqueue(F &&f, Args &&... args)
+ -> std::future::type> {
+ using return_type = typename std::result_of::type;
+ auto task = std::make_shared>(
+ std::bind(std::forward(f), std::forward(args)...));
+ std::future res = task->get_future();
+ {
+ // queue mutation and the terminate_ check happen under mutex_
+ std::unique_lock lock(mutex_);
+ if (terminate_)
+ throw std::runtime_error("enqueue on stopped ThreadPool");
+ tasks_.emplace([task]() { (*task)(); });
+ }
+ // wake one worker after releasing the lock
+ cond_.notify_one();
+ return res;
+ }
+
+ // Block until task_num tasks have completed (implemented in the .cc).
+ void Wait(int task_num);
+
+ size_t ThreadNum() {
+ return thread_num_;
+ }
+
+private:
+ // checked under mutex_ in Enqueue; set when the pool shuts down
+ bool terminate_;
+ size_t thread_num_;
+ // completion counter used by Wait() — NOTE(review): exact semantics
+ // live in the .cc, confirm there
+ std::atomic_int complete_task_num_;
+ std::vector threads_;
+ std::queue> tasks_;
+ std::mutex mutex_;
+ std::condition_variable cond_;
+};
diff --git a/ps-lite/include/common/thread_safe_hash_map.h b/ps-lite/include/common/thread_safe_hash_map.h
new file mode 100644
index 0000000..ce1f1e7
--- /dev/null
+++ b/ps-lite/include/common/thread_safe_hash_map.h
@@ -0,0 +1,325 @@
+#pragma once
+#include
+#include
+#include
+#include "shared_mutex.h"
+
+namespace ps {
+/*
+ thread_safe unordered_map
+ use read-write lock to guarantee concurrency
+*/
+/*
+ * Thread-safe wrapper around std::unordered_map.
+ *
+ * Read-only queries take a shared (s_lock) lock; mutators and methods
+ * returning non-const iterators take an exclusive (x_lock) lock.
+ *
+ * Fixes vs. the original:
+ *  - swap() called map.swap(__x._M_h); `_M_h` is a private libstdc++
+ *    internal member of std::unordered_map, so this did not compile —
+ *    std::unordered_map::swap takes the map itself.
+ *  - stripped template arguments restored throughout.
+ *
+ * NOTE(review): iterators/references returned to the caller outlive the
+ * internal lock; they are only safe while no concurrent writer runs.
+ */
+template <typename _Key, typename _Tp, typename _Hash = std::hash<_Key>,
+          typename _Pred = std::equal_to<_Key>,
+          typename _Alloc = std::allocator<std::pair<const _Key, _Tp>>>
+class threadsafe_unordered_map {
+private:
+    // underlying container; every access below holds mtx
+    std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc> map;
+    // bucketed reader/writer lock; mutable so const readers can lock
+    mutable shared_mutex<4> mtx;
+
+public:
+    using map_type = std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>;
+    using key_type = typename map_type::key_type;
+    using mapped_type = typename map_type::mapped_type;
+    using value_type = typename map_type::value_type;
+    using hasher = typename map_type::hasher;
+    using key_equal = typename map_type::key_equal;
+    using allocator_type = typename map_type::allocator_type;
+    using reference = typename map_type::reference;
+    using const_reference = typename map_type::const_reference;
+    using pointer = typename map_type::pointer;
+    using const_pointer = typename map_type::const_pointer;
+    using iterator = typename map_type::iterator;
+    using const_iterator = typename map_type::const_iterator;
+    using local_iterator = typename map_type::local_iterator;
+    using const_local_iterator = typename map_type::const_local_iterator;
+    using size_type = typename map_type::size_type;
+    using difference_type = typename map_type::difference_type;
+
+    // --- construction (mirrors std::unordered_map's constructor set;
+    // no locking needed: the object is not yet shared) ---
+    threadsafe_unordered_map() = default;
+    threadsafe_unordered_map(const threadsafe_unordered_map &) = delete;
+    threadsafe_unordered_map(threadsafe_unordered_map &&) = default;
+    threadsafe_unordered_map &
+    operator=(const threadsafe_unordered_map &) = delete;
+    threadsafe_unordered_map &operator=(threadsafe_unordered_map &&) = delete;
+    explicit threadsafe_unordered_map(
+        size_type __n, const hasher &__hf = hasher(),
+        const key_equal &__eql = key_equal(),
+        const allocator_type &__a = allocator_type()) :
+        map(__n, __hf, __eql, __a) {
+    }
+    template <typename _InputIterator>
+    threadsafe_unordered_map(_InputIterator __first, _InputIterator __last,
+                             size_type __n = 0, const hasher &__hf = hasher(),
+                             const key_equal &__eql = key_equal(),
+                             const allocator_type &__a = allocator_type()) :
+        map(__first, __last, __n, __hf, __eql, __a) {
+    }
+    threadsafe_unordered_map(const map_type &v) : map(v) {
+    }
+    threadsafe_unordered_map(map_type &&rv) : map(std::move(rv)) {
+    }
+    explicit threadsafe_unordered_map(const allocator_type &__a) : map(__a) {
+    }
+    threadsafe_unordered_map(const map_type &__umap,
+                             const allocator_type &__a) :
+        map(__umap, __a) {
+    }
+    threadsafe_unordered_map(map_type &&__umap, const allocator_type &__a) :
+        map(std::move(__umap), __a) {
+    }
+    threadsafe_unordered_map(std::initializer_list<value_type> __l,
+                             size_type __n = 0, const hasher &__hf = hasher(),
+                             const key_equal &__eql = key_equal(),
+                             const allocator_type &__a = allocator_type()) :
+        map(__l, __n, __hf, __eql, __a) {
+    }
+    threadsafe_unordered_map(size_type __n, const allocator_type &__a) :
+        threadsafe_unordered_map(__n, hasher(), key_equal(), __a) {
+    }
+    threadsafe_unordered_map(size_type __n, const hasher &__hf,
+                             const allocator_type &__a) :
+        threadsafe_unordered_map(__n, __hf, key_equal(), __a) {
+    }
+    template <typename _InputIterator>
+    threadsafe_unordered_map(_InputIterator __first, _InputIterator __last,
+                             size_type __n, const allocator_type &__a) :
+        map(__first, __last, __n, __a) {
+    }
+    template <typename _InputIterator>
+    threadsafe_unordered_map(_InputIterator __first, _InputIterator __last,
+                             size_type __n, const hasher &__hf,
+                             const allocator_type &__a) :
+        threadsafe_unordered_map(__first, __last, __n, __hf, key_equal(),
+                                 __a) {
+    }
+    threadsafe_unordered_map(std::initializer_list<value_type> __l,
+                             size_type __n, const allocator_type &__a) :
+        threadsafe_unordered_map(__l, __n, hasher(), key_equal(), __a) {
+    }
+    threadsafe_unordered_map(std::initializer_list<value_type> __l,
+                             size_type __n, const hasher &__hf,
+                             const allocator_type &__a) :
+        threadsafe_unordered_map(__l, __n, __hf, key_equal(), __a) {
+    }
+
+    // --- capacity (shared lock) ---
+    bool empty() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.empty();
+    }
+    size_type size() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.size();
+    }
+    size_type max_size() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.max_size();
+    }
+
+    // --- iteration; non-const forms take the exclusive lock because a
+    // non-const iterator permits mutation ---
+    iterator begin() noexcept {
+        x_lock<4> write_lock(mtx);
+        return map.begin();
+    }
+    const_iterator begin() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.begin();
+    }
+    const_iterator cbegin() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.cbegin();
+    }
+    iterator end() noexcept {
+        x_lock<4> write_lock(mtx);
+        return map.end();
+    }
+    const_iterator end() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.end();
+    }
+    const_iterator cend() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.cend();
+    }
+
+    // --- modifiers (exclusive lock) ---
+    template <typename... _Args>
+    std::pair<iterator, bool> emplace(_Args &&... __args) {
+        x_lock<4> write_lock(mtx);
+        return map.emplace(std::forward<_Args>(__args)...);
+    }
+    template <typename... _Args>
+    iterator emplace_hint(const_iterator __pos, _Args &&... __args) {
+        x_lock<4> write_lock(mtx);
+        return map.emplace_hint(__pos, std::forward<_Args>(__args)...);
+    }
+    std::pair<iterator, bool> insert(const value_type &__x) {
+        x_lock<4> write_lock(mtx);
+        return map.insert(__x);
+    }
+    template <typename _Pair,
+              typename = typename std::enable_if<std::is_constructible<
+                  value_type, _Pair &&>::value>::type>
+    std::pair<iterator, bool> insert(_Pair &&__x) {
+        x_lock<4> write_lock(mtx);
+        return map.insert(std::forward<_Pair>(__x));
+    }
+    iterator insert(const_iterator __hint, const value_type &__x) {
+        x_lock<4> write_lock(mtx);
+        return map.insert(__hint, __x);
+    }
+    template <typename _Pair,
+              typename = typename std::enable_if<std::is_constructible<
+                  value_type, _Pair &&>::value>::type>
+    iterator insert(const_iterator __hint, _Pair &&__x) {
+        x_lock<4> write_lock(mtx);
+        return map.insert(__hint, std::forward<_Pair>(__x));
+    }
+    template <typename _InputIterator>
+    void insert(_InputIterator __first, _InputIterator __last) {
+        x_lock<4> write_lock(mtx);
+        map.insert(__first, __last);
+    }
+    void insert(std::initializer_list<value_type> __l) {
+        x_lock<4> write_lock(mtx);
+        map.insert(__l);
+    }
+    iterator erase(const_iterator __position) {
+        x_lock<4> write_lock(mtx);
+        return map.erase(__position);
+    }
+    iterator erase(iterator __position) {
+        x_lock<4> write_lock(mtx);
+        return map.erase(__position);
+    }
+    size_type erase(const key_type &__x) {
+        x_lock<4> write_lock(mtx);
+        return map.erase(__x);
+    }
+    iterator erase(const_iterator __first, const_iterator __last) {
+        x_lock<4> write_lock(mtx);
+        return map.erase(__first, __last);
+    }
+    void clear() noexcept {
+        x_lock<4> write_lock(mtx);
+        map.clear();
+    }
+    // swap with a plain unordered_map (fixed: pass __x itself, not the
+    // private libstdc++ member __x._M_h)
+    void swap(map_type &__x) noexcept(noexcept(map.swap(__x))) {
+        x_lock<4> write_lock(mtx);
+        map.swap(__x);
+    }
+
+    // --- observers (shared lock) ---
+    hasher hash_function() const {
+        s_lock<4> read_lock(mtx);
+        return map.hash_function();
+    }
+    key_equal key_eq() const {
+        s_lock<4> read_lock(mtx);
+        return map.key_eq();
+    }
+
+    // --- lookup ---
+    iterator find(const key_type &__x) {
+        x_lock<4> write_lock(mtx);
+        return map.find(__x);
+    }
+    const_iterator find(const key_type &__x) const {
+        s_lock<4> read_lock(mtx);
+        return map.find(__x);
+    }
+    size_type count(const key_type &__x) const {
+        s_lock<4> read_lock(mtx);
+        return map.count(__x);
+    }
+    std::pair<iterator, iterator> equal_range(const key_type &__x) {
+        x_lock<4> write_lock(mtx);
+        return map.equal_range(__x);
+    }
+    std::pair<const_iterator, const_iterator>
+    equal_range(const key_type &__x) const {
+        s_lock<4> read_lock(mtx);
+        return map.equal_range(__x);
+    }
+    mapped_type &operator[](const key_type &__k) {
+        x_lock<4> write_lock(mtx);
+        return map[__k];
+    }
+    mapped_type &operator[](key_type &&__k) {
+        x_lock<4> write_lock(mtx);
+        return map[std::move(__k)];
+    }
+    mapped_type &at(const key_type &__k) {
+        x_lock<4> write_lock(mtx);
+        return map.at(__k);
+    }
+    const mapped_type &at(const key_type &__k) const {
+        s_lock<4> read_lock(mtx);
+        return map.at(__k);
+    }
+
+    // --- bucket interface (shared lock for const forms) ---
+    size_type bucket_count() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.bucket_count();
+    }
+    size_type max_bucket_count() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.max_bucket_count();
+    }
+    size_type bucket_size(size_type __n) const {
+        s_lock<4> read_lock(mtx);
+        return map.bucket_size(__n);
+    }
+    size_type bucket(const key_type &__key) const {
+        s_lock<4> read_lock(mtx);
+        return map.bucket(__key);
+    }
+    local_iterator begin(size_type __n) {
+        x_lock<4> write_lock(mtx);
+        return map.begin(__n);
+    }
+    const_local_iterator begin(size_type __n) const {
+        s_lock<4> read_lock(mtx);
+        return map.begin(__n);
+    }
+    const_local_iterator cbegin(size_type __n) const {
+        s_lock<4> read_lock(mtx);
+        return map.cbegin(__n);
+    }
+    local_iterator end(size_type __n) {
+        x_lock<4> write_lock(mtx);
+        return map.end(__n);
+    }
+    const_local_iterator end(size_type __n) const {
+        s_lock<4> read_lock(mtx);
+        return map.end(__n);
+    }
+    const_local_iterator cend(size_type __n) const {
+        s_lock<4> read_lock(mtx);
+        return map.cend(__n);
+    }
+
+    // --- hash policy ---
+    float load_factor() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.load_factor();
+    }
+    float max_load_factor() const noexcept {
+        s_lock<4> read_lock(mtx);
+        return map.max_load_factor();
+    }
+    void max_load_factor(float __z) {
+        x_lock<4> write_lock(mtx);
+        map.max_load_factor(__z);
+    }
+    void rehash(size_type __n) {
+        x_lock<4> write_lock(mtx);
+        map.rehash(__n);
+    }
+    void reserve(size_type __n) {
+        x_lock<4> write_lock(mtx);
+        map.reserve(__n);
+    }
+
+    // ----------------------------- Added function
+    // ----------------------------------
+    // Atomically insert mapped_type(__args...) under __x unless the key
+    // already exists; returns an iterator to the (old or new) entry.
+    template <typename... _Args>
+    const_iterator emplaceIfAbsent(const key_type &__x, _Args &&... __args) {
+        x_lock<4> write_lock(mtx);
+        iterator iter = map.find(__x);
+        if (iter == map.end()) {
+            iter = map.emplace(__x,
+                               mapped_type(std::forward<_Args>(__args)...))
+                       .first;
+        }
+        return iter;
+    }
+};
+
+} // namespace ps
diff --git a/ps-lite/include/ps/base.h b/ps-lite/include/ps/base.h
new file mode 100644
index 0000000..0984d05
--- /dev/null
+++ b/ps-lite/include/ps/base.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_BASE_H_
+#define PS_BASE_H_
+#include
+#include "ps/internal/utils.h"
+namespace ps {
+
+#if USE_KEY32
+/*! \brief Use unsigned 32-bit int as the key type */
+using Key = uint32_t;
+#else
+/*! \brief Use unsigned 64-bit int as the key type */
+using Key = uint64_t;
+#endif
+/*! \brief The maximal allowed key value */
+static const Key kMaxKey = std::numeric_limits::max();
+/** \brief node ID for the scheduler */
+static const int kScheduler = 1;
+/**
+ * \brief the server node group ID
+ *
+ * group id can be combined (the three ids use distinct bits):
+ * - kServerGroup + kScheduler means all server nodes and the scheduler
+ * - kServerGroup + kWorkerGroup means all server and worker nodes
+ */
+static const int kServerGroup = 2;
+/** \brief the worker node group ID */
+static const int kWorkerGroup = 4;
+
+} // namespace ps
+#endif // PS_BASE_H_
diff --git a/ps-lite/include/ps/internal/assign_op.h b/ps-lite/include/ps/internal/assign_op.h
new file mode 100644
index 0000000..7941960
--- /dev/null
+++ b/ps-lite/include/ps/internal/assign_op.h
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ * \file assign_op.h
+ * \brief assignment operator
+ * http://en.cppreference.com/w/cpp/language/operator_assignment
+ */
+#ifndef PS_INTERNAL_ASSIGN_OP_H_
+#define PS_INTERNAL_ASSIGN_OP_H_
+#include "ps/internal/utils.h"
+namespace ps {
+
+enum AssignOp {
+ ASSIGN, // a = b
+ PLUS, // a += b
+ MINUS, // a -= b
+ TIMES, // a *= b
+ DIVIDE, // a /= b
+ AND, // a &= b (integer only, see AssignFuncInt)
+ OR, // a |= b (integer only, see AssignFuncInt)
+ XOR // a ^= b (integer only, see AssignFuncInt)
+};
+
+/**
+ * \brief apply an assignment: *rhs op= lhs
+ */
+// Apply `*rhs op= lhs` for the arithmetic assignment operators.
+// Bit operations (AND/OR/XOR) are rejected at runtime — use
+// AssignFuncInt for integer types.
+// Fixed: the body previously referenced undeclared names `left`/`right`
+// instead of the parameters `lhs`/`rhs`, so it could not compile.
+template <typename T>
+inline void AssignFunc(const T &lhs, AssignOp op, T *rhs) {
+    switch (op) {
+    case ASSIGN:
+        *rhs = lhs;
+        break;
+    case PLUS:
+        *rhs += lhs;
+        break;
+    case MINUS:
+        *rhs -= lhs;
+        break;
+    case TIMES:
+        *rhs *= lhs;
+        break;
+    case DIVIDE:
+        *rhs /= lhs;
+        break;
+    default:
+        LOG(FATAL) << "use AssignOpInt..";
+    }
+}
+
+/**
+ * \brief apply an assignment including bit operations: *rhs op= lhs;
+ * only works for integers
+ */
+// Apply `*rhs op= lhs`, including the bit operations AND/OR/XOR; only
+// valid for integer types.
+// Fixed: the body previously referenced undeclared names `left`/`right`
+// instead of the parameters `lhs`/`rhs`, so it could not compile.
+template <typename T>
+inline void AssignFuncInt(const T &lhs, AssignOp op, T *rhs) {
+    switch (op) {
+    case ASSIGN:
+        *rhs = lhs;
+        break;
+    case PLUS:
+        *rhs += lhs;
+        break;
+    case MINUS:
+        *rhs -= lhs;
+        break;
+    case TIMES:
+        *rhs *= lhs;
+        break;
+    case DIVIDE:
+        *rhs /= lhs;
+        break;
+    case AND:
+        *rhs &= lhs;
+        break;
+    case OR:
+        *rhs |= lhs;
+        break;
+    case XOR:
+        *rhs ^= lhs;
+        break;
+    }
+}
+
+} // namespace ps
+#endif // PS_INTERNAL_ASSIGN_OP_H_
diff --git a/ps-lite/include/ps/internal/customer.h b/ps-lite/include/ps/internal/customer.h
new file mode 100644
index 0000000..0a7ee11
--- /dev/null
+++ b/ps-lite/include/ps/internal/customer.h
@@ -0,0 +1,119 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_CUSTOMER_H_
+#define PS_INTERNAL_CUSTOMER_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ps/internal/message.h"
+#include "ps/internal/threadsafe_pqueue.h"
+namespace ps {
+
+/**
+ * \brief The object for communication.
+ *
+ * As a sender, a customer tracks the responses for each request sent.
+ *
+ * It has its own receiving thread which is able to process any message received
+ * from a remote node with `msg.meta.customer_id` equal to this customer's id
+ */
+class Customer {
+public:
+ /**
+ * \brief the handle for a received message
+ * \param recved the received message
+ */
+ using RecvHandle = std::function;
+
+ /**
+ * \brief constructor
+ * \param app_id the globally unique id indicating the application the
+ * postoffice is serving for
+ * \param customer_id the locally unique id indicating the customer of
+ * a postoffice
+ * \param recv_handle the function for processing a received message
+ */
+ Customer(int app_id, int customer_id, const RecvHandle &recv_handle);
+
+ /**
+ * \brief destructor
+ */
+ ~Customer();
+
+ /**
+ * \brief return the globally unique application id
+ */
+ inline int app_id() {
+ return app_id_;
+ }
+
+ /**
+ * \brief return the locally unique customer id
+ */
+ inline int customer_id() {
+ return customer_id_;
+ }
+
+ /**
+ * \brief get a timestamp for a new request. threadsafe
+ * \param recver the receive node id of this request
+ * \return the timestamp of this request
+ */
+ int NewRequest(int recver);
+
+ /**
+ * \brief wait until the request is finished. threadsafe
+ * \param timestamp the timestamp of the request
+ */
+ void WaitRequest(int timestamp);
+
+ /**
+ * \brief return the number of responses received for the request.
+ * threadsafe \param timestamp the timestamp of the request
+ */
+ // int NumResponse(int timestamp);
+
+ /**
+ * \brief add a number of responses to timestamp
+ */
+ // void AddResponse(int timestamp, int num = 1);
+
+ /**
+ * \brief accept a received message from \ref Van. threadsafe
+ * \param recved the received message
+ */
+ inline void Accept(const Message &recved) {
+ recv_queue_.Push(recved);
+ }
+
+private:
+ /**
+ * \brief the thread function run by recv_threads_
+ */
+ void Receiving();
+
+ // globally unique application id this customer belongs to
+ int app_id_;
+
+ // locally unique customer id within the postoffice
+ int customer_id_;
+
+ // callback invoked for each received message
+ RecvHandle recv_handle_;
+ // queue buffering messages accepted from the Van
+ ThreadsafePQueue recv_queue_;
+ // using multithread to speed data processing
+ std::vector> recv_threads_;
+
+ // tracker_mu_/tracker_cond_ guard tracker_, the per-timestamp request
+ // bookkeeping used by NewRequest/WaitRequest
+ std::mutex tracker_mu_;
+ std::condition_variable tracker_cond_;
+ std::unordered_map tracker_;
+ // presumably the next timestamp handed out by NewRequest —
+ // NOTE(review): confirm against customer.cc
+ int cur_timestamp;
+
+ DISALLOW_COPY_AND_ASSIGN(Customer);
+};
+
+} // namespace ps
+#endif // PS_INTERNAL_CUSTOMER_H_
diff --git a/ps-lite/include/ps/internal/env.h b/ps-lite/include/ps/internal/env.h
new file mode 100644
index 0000000..d3e4aa0
--- /dev/null
+++ b/ps-lite/include/ps/internal/env.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright (c) 2016 by Contributors
+ */
+#ifndef PS_INTERNAL_ENV_H_
+#define PS_INTERNAL_ENV_H_
+#include
+#include
+#include
+#include
+namespace ps {
+
+/**
+ * \brief Environment configurations
+ */
+class Environment {
+public:
+ /**
+ * \brief return the singleton instance
+ */
+ static inline Environment *Get() {
+ return _GetSharedRef(nullptr).get();
+ }
+ /**
+ * \brief return a shared ptr of the singleton instance
+ */
+ static inline std::shared_ptr _GetSharedRef() {
+ return _GetSharedRef(nullptr);
+ }
+ /**
+ * \brief initialize the environment
+ * \param envs key-value environment variables
+ * \return the initialized singleton instance
+ */
+ static inline Environment *
+ Init(const std::unordered_map &envs) {
+ // the static singleton is created on the first _GetSharedRef call;
+ // the assignment below ensures a later Init still installs envs
+ Environment *env = _GetSharedRef(&envs).get();
+ env->kvs = envs;
+ return env;
+ }
+
+ /**
+ * \brief find the env value.
+ * User-defined env vars first. If not found, check system's environment
+ * \param k the environment key
+ * \return the related environment value, nullptr when not found
+ */
+ const char *find(const char *k) {
+ std::string key(k);
+ // fall back to the process environment when not overridden in kvs
+ return kvs.find(key) == kvs.end() ? getenv(k) : kvs[key].c_str();
+ }
+
+private:
+ explicit Environment(
+ const std::unordered_map *envs) {
+ if (envs)
+ kvs = *envs;
+ }
+
+ static std::shared_ptr
+ _GetSharedRef(const std::unordered_map *envs) {
+ // function-local static: constructed exactly once (C++11 thread-safe)
+ static std::shared_ptr inst_ptr(new Environment(envs));
+ return inst_ptr;
+ }
+
+ // user-defined overrides consulted before the process environment
+ std::unordered_map kvs;
+};
+
+} // namespace ps
+#endif // PS_INTERNAL_ENV_H_
diff --git a/ps-lite/include/ps/internal/message.h b/ps-lite/include/ps/internal/message.h
new file mode 100644
index 0000000..ceb8133
--- /dev/null
+++ b/ps-lite/include/ps/internal/message.h
@@ -0,0 +1,170 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_MESSAGE_H_
+#define PS_INTERNAL_MESSAGE_H_
+#include "common/sarray.h"
+#include "ps/psf/PSFunc.h"
+#include
+#include
+#include
+#include
+
+namespace ps {
+
+/**
+ * \brief information about a node
+ */
+struct Node {
+ /** \brief the empty value */
+ static const int kEmpty;
+ /** \brief default constructor */
+ Node() : id(kEmpty), port(kEmpty), is_recovery(false) {
+ }
+ /** \brief node roles */
+ enum Role { SERVER, WORKER, SCHEDULER };
+ /** \brief get debug string */
+ std::string DebugString() const {
+ std::stringstream ss;
+ ss << "role="
+ << (role == SERVER ? "server" :
+ (role == WORKER ? "worker" : "scheduler"))
+ << (id != kEmpty ? ", id=" + std::to_string(id) : "")
+ << ", ip=" << hostname << ", port=" << port
+ << ", is_recovery=" << is_recovery;
+
+ return ss.str();
+ }
+ /** \brief get short debug string */
+ std::string ShortDebugString() const {
+ std::string str = role == SERVER ? "S" : (role == WORKER ? "W" : "H");
+ if (id != kEmpty)
+ str += "[" + std::to_string(id) + "]";
+ return str;
+ }
+ /** \brief the role of this node */
+ Role role;
+ /** \brief node id */
+ int id;
+ /** \brief customer id */
+ int customer_id;
+ /** \brief hostname or ip */
+ std::string hostname;
+ /** \brief the port this node is binding */
+ int port;
+ /** \brief whether this node is created by failover */
+ bool is_recovery;
+};
+/**
+ * \brief meta info of a system control message
+ */
+struct Control {
+ /** \brief empty constructor */
+ Control() : cmd(EMPTY) {
+ }
+ /** \brief return true is empty */
+ inline bool empty() const {
+ return cmd == EMPTY;
+ }
+ /** \brief get debug string */
+ std::string DebugString() const {
+ if (empty())
+ return "";
+ std::vector cmds = {"EMPTY", "TERMINATE", "ADD_NODE",
+ "BARRIER", "ACK", "HEARTBEAT"};
+ std::stringstream ss;
+ ss << "cmd=" << cmds[cmd];
+ if (node.size()) {
+ ss << ", node={";
+ for (const Node &n : node)
+ ss << " " << n.DebugString();
+ ss << " }";
+ }
+ if (cmd == BARRIER)
+ ss << ", barrier_group=" << barrier_group;
+ if (cmd == ACK)
+ ss << ", msg_sig=" << msg_sig;
+ return ss.str();
+ }
+ /** \brief all commands */
+ enum Command { EMPTY, TERMINATE, ADD_NODE, BARRIER, ACK, HEARTBEAT };
+ /** \brief the command */
+ Command cmd;
+ /** \brief node infos */
+ std::vector node;
+ /** \brief the node group for a barrier, such as kWorkerGroup */
+ int barrier_group;
+ /** message signature */
+ uint64_t msg_sig;
+};
+/**
+ * \brief meta info of a message
+ */
+struct Meta {
+ /** \brief the empty value */
+ static const int kEmpty;
+ /** \brief default constructor */
+ Meta() :
+ app_id(kEmpty), customer_id(kEmpty), timestamp(kEmpty), sender(kEmpty),
+ recver(kEmpty), request(false), priority(kEmpty),
+ psftype(PsfType::DensePull) {
+ }
+ std::string DebugString() const {
+ std::stringstream ss;
+ if (sender == Node::kEmpty) {
+ ss << "?";
+ } else {
+ ss << sender;
+ }
+ ss << " => " << recver;
+ if (timestamp != kEmpty)
+ ss << ", timestamp=" << timestamp;
+ if (!control.empty()) {
+ ss << ", control={ " << control.DebugString() << " }";
+ } else {
+ ss << ", app_id=" << app_id << ", customer_id=" << customer_id
+ << ", priority=" << priority << ", psfType=" << psftype;
+ }
+ return ss.str();
+ }
+ /** \brief the unique id of the application this message is for*/
+ int app_id;
+ /** \brief customer id*/
+ int customer_id;
+ /** \brief the timestamp of this message */
+ int timestamp;
+ /** \brief the node id of the sender of this message */
+ int sender;
+ /** \brief the node id of the receiver of this message */
+ int recver;
+ /** \brief whether or not this is a request message*/
+ bool request;
+ /** \brief system control message */
+ Control control;
+ /** \brief message priority */
+ int priority;
+ /** \brief server-side computation op for keys */
+ PsfType psftype;
+};
+/**
+ * \brief messages communicated among nodes.
+ */
+struct Message {
+ /** \brief the meta info of this message */
+ Meta meta;
+ /** \brief the large chunk of data of this message */
+ std::vector> data;
+
+ std::string DebugString() const {
+ std::stringstream ss;
+ ss << meta.DebugString();
+ if (data.size()) {
+ ss << " Body:";
+ for (const auto &d : data)
+ ss << " data_size=" << d.size();
+ }
+ return ss.str();
+ }
+};
+} // namespace ps
+#endif // PS_INTERNAL_MESSAGE_H_
diff --git a/ps-lite/include/ps/internal/parallel_kv_match.h b/ps-lite/include/ps/internal/parallel_kv_match.h
new file mode 100644
index 0000000..6a9bf18
--- /dev/null
+++ b/ps-lite/include/ps/internal/parallel_kv_match.h
@@ -0,0 +1,125 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ * \file parallel_kv_match.h
+ * \brief parallel key-value pairs matching
+ */
+#ifndef PS_INTERNAL_PARALLEL_KV_MATCH_H_
+#define PS_INTERNAL_PARALLEL_KV_MATCH_H_
+#include
+#include
+#include "ps/sarray.h"
+#include "ps/internal/assign_op.h"
+
+namespace ps {
+namespace {
+/**
+ * \brief thread function, internal use
+ *
+ * \param src_key start of source key
+ * \param src_key_end end of source key
+ * \param src_val start of source val
+ * \param dst_key start of destination key
+ * \param dst_key_end end of destination key
+ * \param dst_val start of destination val
+ * \param k length of a single value
+ * \param op assignment operator
+ * \param grainsize thread grainsize size
+ * \param n number of matched kv pairs
+ */
+template
+void ParallelOrderedMatch(const K *src_key, const K *src_key_end,
+ const V *src_val, const K *dst_key,
+ const K *dst_key_end, V *dst_val, int k, AsOp op,
+ size_t grainsize, size_t *n) {
+ size_t src_len = std::distance(src_key, src_key_end);
+ size_t dst_len = std::distance(dst_key, dst_key_end);
+ if (dst_len == 0 || src_len == 0)
+ return;
+
+ // drop the unmatched tail of src
+ src_key = std::lower_bound(src_key, src_key_end, *dst_key);
+ src_val += (src_key - (src_key_end - src_len)) * k;
+
+ if (dst_len <= grainsize) {
+ while (dst_key != dst_key_end && src_key != src_key_end) {
+ if (*src_key < *dst_key) {
+ ++src_key;
+ src_val += k;
+ } else {
+ if (!(*dst_key < *src_key)) {
+ for (int i = 0; i < k; ++i) {
+ AssignOp(dst_val[i], src_val[i], op);
+ }
+ ++src_key;
+ src_val += k;
+ *n += k;
+ }
+ ++dst_key;
+ dst_val += k;
+ }
+ }
+ } else {
+ std::thread thr(ParallelOrderedMatch, src_key, src_key_end,
+ src_val, dst_key, dst_key + dst_len / 2, dst_val, k, op,
+ grainsize, n);
+ size_t m = 0;
+ ParallelOrderedMatch(
+ src_key, src_key_end, src_val, dst_key + dst_len / 2, dst_key_end,
+ dst_val + (dst_len / 2) * k, k, op, grainsize, &m);
+ thr.join();
+ *n += m;
+ }
+}
+} // namespace
+
+/**
+ * \brief Merge \a src_val into \a dst_val by matching keys. Keys must be unique
+ * and sorted.
+ *
+ * \code
+ * if (dst_key[i] == src_key[j]) {
+ * dst_val[i] op= src_val[j]
+ * }
+ * \endcode
+ *
+ * When finished, \a dst_val will have length `k * dst_key.size()` and filled
+ * with matched values. Unmatched values will be untouched if they exist or filled with
+ * 0.
+ *
+ * \tparam K type of key
+ * \tparam V type of value
+ * \tparam C type of the container such as \ref SArray or \ref std::vector
+ * \param src_key the source keys
+ * \param src_val the source values
+ * \param dst_key the destination keys
+ * \param dst_val the destination values.
+ * \param k the length of a single value (default is 1)
+ * \param op the assignment operator (default is ASSIGN)
+ * \param num_threads number of thread (default is 1)
+ * \return the number of matched kv pairs
+ */
+template
+size_t ParallelOrderedMatch(const SArray &src_key, const SArray &src_val,
+ const SArray &dst_key, C *dst_val, int k = 1,
+ AssignOp op = ASSIGN, int num_threads = 1) {
+ // do check
+ CHECK_GT(num_threads, 0);
+ CHECK_EQ(src_key.size() * k, src_val.size());
+ CHECK_NOTNULL(dst_val->resize(dst_key.size() * k));
+ if (dst_key.empty())
+ return 0;
+
+ // shorten the matching range
+ Range range = FindRange(dst_key, src_key.begin(), src_key.end());
+ size_t grainsize = std::max(range.size() * k / num_threads + 5,
+ static_cast(1024 * 1024));
+ size_t n = 0;
+ ParallelOrderedMatch(
+ src_key.begin(), src_key.end(), src_val.begin(),
+ dst_key.begin() + range.begin(), dst_key.begin() + range.end(),
+ dst_val->begin() + range.begin() * k, k, op, grainsize, &n);
+ return n;
+}
+
+} // namespace ps
+#endif // PS_INTERNAL_PARALLEL_KV_MATCH_H_
diff --git a/ps-lite/include/ps/internal/parallel_sort.h b/ps-lite/include/ps/internal/parallel_sort.h
new file mode 100644
index 0000000..174b0be
--- /dev/null
+++ b/ps-lite/include/ps/internal/parallel_sort.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ * @file parallel_sort.h
+ * @brief Parallel sort
+ */
+#ifndef PS_INTERNAL_PARALLEL_SORT_H_
+#define PS_INTERNAL_PARALLEL_SORT_H_
+#include
+#include
+#include
+#include "ps/sarray.h"
+namespace ps {
+
+namespace {
+/**
+ * \brief the thread function
+ *
+ * \param data start pointer of data
+ * \param len length of data
+ * \param grainsize max data length of one thread
+ * \param cmp comparison function
+ */
+template
+void ParallelSort(T *data, size_t len, size_t grainsize, const Fn &cmp) {
+ if (len <= grainsize) {
+ std::sort(data, data + len, cmp);
+ } else {
+ std::thread thr(ParallelSort, data, len / 2, grainsize, cmp);
+ ParallelSort(data + len / 2, len - len / 2, grainsize, cmp);
+ thr.join();
+
+ std::inplace_merge(data, data + len / 2, data + len, cmp);
+ }
+}
+} // namespace
+
+/**
+ * \brief Parallel Sort
+ *
+ * \param arr the array for sorting
+ * \param num_threads number of thread
+ * \param cmp the comparison function such as
+ * [](const T& a, const T& b) {* return a < b; }
+ * or an even simpler version:
+ * std::less()
+ */
+template
+void ParallelSort(SArray *arr, int num_threads = 2,
+ const Fn &cmp = std::less()) {
+ CHECK_GT(num_threads, 0);
+ CHECK(cmp);
+ size_t grainsize =
+ std::max(arr->size() / num_threads + 5, (size_t)1024 * 16);
+ ParallelSort(arr->data(), arr->size(), grainsize, cmp);
+}
+
+} // namespace ps
+#endif // PS_INTERNAL_PARALLEL_SORT_H_
diff --git a/ps-lite/include/ps/internal/postoffice.h b/ps-lite/include/ps/internal/postoffice.h
new file mode 100644
index 0000000..674b01e
--- /dev/null
+++ b/ps-lite/include/ps/internal/postoffice.h
@@ -0,0 +1,222 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_POSTOFFICE_H_
+#define PS_INTERNAL_POSTOFFICE_H_
+#include
+#include
+#include
+#include
+#include
+#include "ps/range.h"
+#include "ps/internal/env.h"
+#include "ps/internal/customer.h"
+#include "ps/internal/van.h"
+namespace ps {
+/**
+ * \brief the center of the system
+ */
+class Postoffice {
+public:
+ /**
+ * \brief return the singleton object
+ */
+ static Postoffice *Get() {
+ static Postoffice e;
+ return &e;
+ }
+ /** \brief get the van */
+ Van *van() {
+ return van_;
+ }
+ /**
+ * \brief start the system
+ *
+ * This function will block until every node is started.
+ * \param argv0 the program name, used for logging.
+ * \param do_barrier whether to block until every node is started.
+ */
+ void Start(int customer_id, const char *argv0, const bool do_barrier);
+ /**
+ * \brief terminate the system
+ *
+ * All nodes should call this function before exiting.
+ * \param do_barrier whether to do block until every node is finalized,
+ * default true.
+ */
+ void Finalize(const int customer_id, const bool do_barrier = true);
+ /**
+ * \brief add a customer to the system. threadsafe
+ */
+ void AddCustomer(Customer *customer);
+ /**
+ * \brief remove a customer given its id. threadsafe
+ */
+ void RemoveCustomer(Customer *customer);
+ /**
+ * \brief get the customer by id, threadsafe
+ * \param app_id the application id
+ * \param customer_id the customer id
+ * \param timeout timeout in sec
+ * \return return nullptr if doesn't exist and timeout
+ */
+ Customer *GetCustomer(int app_id, int customer_id, int timeout = 0) const;
+ /**
+ * \brief get the id of a node (group), threadsafe
+ *
+ * if it is a node group, return the list of node ids in this
+ * group. otherwise, return {node_id}
+ */
+ const std::vector &GetNodeIDs(int node_id) const {
+ const auto it = node_ids_.find(node_id);
+ CHECK(it != node_ids_.cend()) << "node " << node_id << " doesn't exist";
+ return it->second;
+ }
+ /**
+ * \brief return the key ranges of all server nodes
+ */
+ const std::vector &GetServerKeyRanges();
+ /**
+ * \brief the template of a callback
+ */
+ using Callback = std::function;
+ /**
+ * \brief Register a callback to the system which is called after Finalize()
+ *
+ * The following codes are equal
+ * \code {cpp}
+ * RegisterExitCallback(cb);
+ * Finalize();
+ * \endcode
+ *
+ * \code {cpp}
+ * Finalize();
+ * cb();
+ * \endcode
+ * \param cb the callback function
+ */
+ void RegisterExitCallback(const Callback &cb) {
+ exit_callback_ = cb;
+ }
+ /**
+ * \brief convert from a worker rank into a node id
+ * \param rank the worker rank
+ */
+ static inline int WorkerRankToID(int rank) {
+ return rank * 2 + 9;
+ }
+ /**
+ * \brief convert from a server rank into a node id
+ * \param rank the server rank
+ */
+ static inline int ServerRankToID(int rank) {
+ return rank * 2 + 8;
+ }
+ /**
+ * \brief convert from a node id into a server or worker rank
+ * \param id the node id
+ */
+ static inline int IDtoRank(int id) {
+#ifdef _MSC_VER
+#undef max
+#endif
+ return std::max((id - 8) / 2, 0);
+ }
+ /** \brief Returns the number of worker nodes */
+ int num_workers() const {
+ return num_workers_;
+ }
+ /** \brief Returns the number of server nodes */
+ int num_servers() const {
+ return num_servers_;
+ }
+ /** \brief Returns the rank of this node in its group
+ *
+ * Each worker will have a unique rank within [0, NumWorkers()). So are
+ * servers. This function is available only after \ref Start has been
+ * called.
+ */
+ int my_rank() const {
+ return IDtoRank(van_->my_node().id);
+ }
+ /** \brief Returns true if this node is a worker node */
+ int is_worker() const {
+ return is_worker_;
+ }
+ /** \brief Returns true if this node is a server node. */
+ int is_server() const {
+ return is_server_;
+ }
+ /** \brief Returns true if this node is a scheduler node. */
+ int is_scheduler() const {
+ return is_scheduler_;
+ }
+ /** \brief Returns the verbose level. */
+ int verbose() const {
+ return verbose_;
+ }
+ /** \brief Return whether this node is a recovery node */
+ bool is_recovery() const {
+ return van_->my_node().is_recovery;
+ }
+ /**
+ * \brief barrier
+ * \param node_id the barrier group id
+ */
+ void Barrier(int customer_id, int node_group);
+ /**
+ * \brief process a control message, called by van
+ * \param recv the received message
+ */
+ void Manage(const Message &recv);
+ /**
+ * \brief update the heartbeat record map
+ * \param node_id the \ref Node id
+ * \param t the last received heartbeat time
+ */
+ void UpdateHeartbeat(int node_id, time_t t) {
+ std::lock_guard lk(heartbeat_mu_);
+ heartbeats_[node_id] = t;
+ }
+ /**
+ * \brief get node ids that haven't reported heartbeats for over t seconds
+ * \param t timeout in sec
+ */
+ std::vector GetDeadNodes(int t = 60);
+
+private:
+ Postoffice();
+ ~Postoffice() {
+ delete van_;
+ }
+
+ void InitEnvironment();
+ Van *van_;
+ mutable std::mutex mu_;
+ // app_id -> (customer_id -> customer pointer)
+ std::unordered_map> customers_;
+ std::unordered_map> node_ids_;
+ std::mutex server_key_ranges_mu_;
+ std::vector server_key_ranges_;
+ bool is_worker_, is_server_, is_scheduler_;
+ int num_servers_, num_workers_;
+ std::unordered_map> barrier_done_;
+ int verbose_;
+ std::mutex barrier_mu_;
+ std::condition_variable barrier_cond_;
+ std::mutex heartbeat_mu_;
+ std::mutex start_mu_;
+ int init_stage_ = 0;
+ std::unordered_map heartbeats_;
+ Callback exit_callback_;
+ /** \brief Holding a shared_ptr to prevent it from being destructed too
+ * early */
+ std::shared_ptr env_ref_;
+ time_t start_time_;
+ DISALLOW_COPY_AND_ASSIGN(Postoffice);
+};
+
+/** \brief verbose log */
+#define PS_VLOG(x) LOG_IF(INFO, x <= Postoffice::Get()->verbose())
+} // namespace ps
+#endif // PS_INTERNAL_POSTOFFICE_H_
diff --git a/ps-lite/include/ps/internal/threadsafe_pqueue.h b/ps-lite/include/ps/internal/threadsafe_pqueue.h
new file mode 100644
index 0000000..95a927f
--- /dev/null
+++ b/ps-lite/include/ps/internal/threadsafe_pqueue.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_THREADSAFE_PQUEUE_H_
+#define PS_INTERNAL_THREADSAFE_PQUEUE_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ps/base.h"
+namespace ps {
+
+/**
+ * \brief thread-safe queue allowing push and waited pop
+ */
+class ThreadsafePQueue {
+public:
+ ThreadsafePQueue() {
+ }
+ ~ThreadsafePQueue() {
+ }
+
+ /**
+ * \brief push a value onto the queue. threadsafe.
+ * \param new_value the value
+ */
+ void Push(Message new_value) {
+ mu_.lock();
+ queue_.push(std::move(new_value));
+ mu_.unlock();
+ cond_.notify_all();
+ }
+
+ /**
+ * \brief wait until an element can be popped from the front, threadsafe
+ * \param value the popped value
+ */
+ void WaitAndPop(Message *value) {
+ std::unique_lock lk(mu_);
+ cond_.wait(lk, [this] { return !queue_.empty(); });
+ *value = std::move(queue_.top());
+ queue_.pop();
+ }
+
+private:
+ class Compare {
+ public:
+ bool operator()(const Message &l, const Message &r) {
+ // hbsun: note this is a max-heap;
+ // in other words, the higher the priority, the sooner the
+ // message is processed: push is handled first, pull later
+ return l.meta.priority <= r.meta.priority;
+ }
+ };
+ mutable std::mutex mu_;
+ std::priority_queue, Compare> queue_;
+ std::condition_variable cond_;
+};
+
+} // namespace ps
+
+#endif // PS_INTERNAL_THREADSAFE_PQUEUE_H_
diff --git a/ps-lite/include/ps/internal/threadsafe_queue.h b/ps-lite/include/ps/internal/threadsafe_queue.h
new file mode 100644
index 0000000..a4cd340
--- /dev/null
+++ b/ps-lite/include/ps/internal/threadsafe_queue.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_THREADSAFE_QUEUE_H_
+#define PS_INTERNAL_THREADSAFE_QUEUE_H_
+#include
+#include
+#include
+#include
+#include
+#include "ps/base.h"
+namespace ps {
+
+/**
+ * \brief thread-safe queue allowing push and waited pop
+ */
+template
+class ThreadsafeQueue {
+public:
+ ThreadsafeQueue() {
+ }
+ ~ThreadsafeQueue() {
+ }
+
+ /**
+ * \brief push a value onto the queue. threadsafe.
+ * \param new_value the value
+ */
+ void Push(T new_value) {
+ mu_.lock();
+ queue_.push(std::move(new_value));
+ mu_.unlock();
+ cond_.notify_all();
+ }
+
+ /**
+ * \brief wait until an element can be popped from the front, threadsafe
+ * \param value the popped value
+ */
+ void WaitAndPop(T *value) {
+ std::unique_lock lk(mu_);
+ cond_.wait(lk, [this] { return !queue_.empty(); });
+ *value = std::move(queue_.front());
+ queue_.pop();
+ }
+
+private:
+ mutable std::mutex mu_;
+ std::queue queue_;
+ std::condition_variable cond_;
+};
+
+} // namespace ps
+
+// bool TryPop(T& value) {
+// std::lock_guard lk(mut);
+// if(data_queue.empty())
+// return false;
+// value=std::move(data_queue.front());
+// data_queue.pop();
+// return true;
+// }
+#endif // PS_INTERNAL_THREADSAFE_QUEUE_H_
diff --git a/ps-lite/include/ps/internal/utils.h b/ps-lite/include/ps/internal/utils.h
new file mode 100644
index 0000000..f12b968
--- /dev/null
+++ b/ps-lite/include/ps/internal/utils.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_UTILS_H_
+#define PS_INTERNAL_UTILS_H_
+#include "common/logging.h"
+#include "ps/internal/env.h"
+namespace ps {
+
+#ifdef _MSC_VER
+typedef signed char int8_t;
+typedef __int16 int16_t;
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#else
+#include
+#endif
+
+/*!
+ * \brief Get environment variable as int with default.
+ * \param key the name of environment variable.
+ * \param default_val the default value of the environment variable.
+ * \return The value received
+ */
+template
+inline V GetEnv(const char *key, V default_val) {
+ const char *val = Environment::Get()->find(key);
+ if (val == nullptr) {
+ return default_val;
+ } else {
+ return V(val);
+ }
+}
+
+inline int GetEnv(const char *key, int default_val) {
+ const char *val = Environment::Get()->find(key);
+ if (val == nullptr) {
+ return default_val;
+ } else {
+ return atoi(val);
+ }
+}
+
+#ifndef DISALLOW_COPY_AND_ASSIGN
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName &); \
+ void operator=(const TypeName &)
+#endif
+
+#define LL LOG(ERROR)
+
+} // namespace ps
+#endif // PS_INTERNAL_UTILS_H_
diff --git a/ps-lite/include/ps/internal/van.h b/ps-lite/include/ps/internal/van.h
new file mode 100644
index 0000000..b9f5ef9
--- /dev/null
+++ b/ps-lite/include/ps/internal/van.h
@@ -0,0 +1,213 @@
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef PS_INTERNAL_VAN_H_
+#define PS_INTERNAL_VAN_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ps/base.h"
+#include "ps/internal/message.h"
+namespace ps {
+class Resender;
+class PBMeta;
+/**
+ * \brief Van sends messages to remote nodes
+ *
+ * If environment variable PS_RESEND is set to 1, then van will resend a
+ * message if no ACK message is received within PS_RESEND_TIMEOUT
+ * millisecond
+ */
+class Van {
+public:
+ /**
+ * \brief create Van
+ * \param type zmq, socket, ...
+ */
+ static Van *Create(const std::string &type);
+
+ /** \brief constructor, does nothing. use \ref Start for real start */
+ Van() {
+ }
+
+ /**\brief destructor, does nothing. use \ref Stop for real stop */
+ virtual ~Van() {
+ }
+
+ /**
+ * \brief start van
+ *
+ * must call it before calling Send
+ *
+ * it initializes all connections to other nodes. start the receiving
+ * threads, which keeps receiving messages. if it is a system
+ * control message, give it to postoffice::manager, otherwise, give it to
+ * the corresponding app.
+ */
+ virtual void Start(int customer_id);
+
+ /**
+ * \brief send a message, It is thread-safe
+ * \return the number of bytes sent. -1 if failed
+ */
+ int Send(const Message &msg);
+
+ /**
+ * \brief return my node
+ */
+ inline const Node &my_node() const {
+ CHECK(ready_) << "call Start() first";
+ return my_node_;
+ }
+
+ /**
+ * \brief stop van
+ * stop receiving threads
+ */
+ virtual void Stop();
+
+ /**
+ * \brief get next available timestamp. thread safe
+ */
+ inline int GetTimestamp() {
+ return timestamp_++;
+ }
+
+ /**
+ * \brief whether it is ready for sending. thread safe
+ */
+ inline bool IsReady() {
+ return ready_;
+ }
+
+protected:
+ /**
+ * \brief connect to a node
+ */
+ virtual void Connect(const Node &node) = 0;
+
+ /**
+ * \brief bind to my node
+ * do multiple retries on binding the port. since it's possible that
+ * different nodes on the same machine picked the same port
+ * \return the port bound to, -1 if failed.
+ */
+ virtual int Bind(const Node &node, int max_retry) = 0;
+
+ /**
+ * \brief block until received a message
+ * \return the number of bytes received. -1 if failed or timeout
+ */
+ virtual int RecvMsg(Message *msg) = 0;
+
+ /**
+ * \brief send a message
+ * \return the number of bytes sent
+ */
+ virtual int SendMsg(const Message &msg) = 0;
+
+ /**
+ * \brief pack meta into a string
+ */
+ void PackMeta(const Meta &meta, char **meta_buf, int *buf_size);
+
+ /**
+ * \brief pack meta into protobuf
+ */
+ void PackMetaPB(const Meta &meta, PBMeta *pb);
+
+ /**
+ * \brief unpack meta from a string
+ */
+ void UnpackMeta(const char *meta_buf, int buf_size, Meta *meta);
+
+ Node scheduler_;
+ Node my_node_;
+ bool is_scheduler_;
+ std::mutex start_mu_;
+
+private:
+ /** thread function for receiving */
+ void Receiving();
+
+ /** thread function for heartbeat */
+ void Heartbeat();
+
+ // node's address string (i.e. ip:port) -> node id
+ // this map is updated when ip:port is received for the first time
+ std::unordered_map connected_nodes_;
+ // maps the id of node which is added later to the id of node
+ // which is with the same ip:port and added first
+ std::unordered_map shared_node_mapping_;
+
+ /** whether it is ready for sending */
+ std::atomic ready_{false};
+ std::atomic send_bytes_{0};
+ size_t recv_bytes_ = 0;
+ int num_servers_ = 0;
+ int num_workers_ = 0;
+ /** the thread for receiving messages */
+ std::unique_ptr receiver_thread_;
+ /** the thread for sending heartbeat */
+ std::unique_ptr heartbeat_thread_;
+ std::vector barrier_count_;
+ /** msg resender */
+ Resender *resender_ = nullptr;
+ int drop_rate_ = 0;
+ std::atomic timestamp_{0};
+ int init_stage = 0;
+
+ /**
+ * \brief processing logic of AddNode message for scheduler
+ */
+ void ProcessAddNodeCommandAtScheduler(Message *msg, Meta *nodes,
+ Meta *recovery_nodes);
+
+ /**
+ * \brief processing logic of Terminate message
+ */
+ void ProcessTerminateCommand();
+
+ /**
+ * \brief processing logic of AddNode message (run on each node)
+ */
+ void ProcessAddNodeCommand(Message *msg, Meta *nodes, Meta *recovery_nodes);
+
+ /**
+ * \brief processing logic of Barrier message (run on each node)
+ */
+ void ProcessBarrierCommand(Message *msg);
+
+ /**
+ * \brief processing logic of AddNode message (run on each node)
+ */
+ void ProcessHearbeat(Message *msg);
+
+ /**
+ * \brief processing logic of Data message
+ */
+ void ProcessDataMsg(Message *msg);
+
+ /**
+ * \brief called by ProcessAddNodeCommand, in scheduler it assigns an id to
+ * the newly added node; in other nodes, it updates the node id with what is
+ * received from scheduler
+ */
+ void UpdateLocalID(Message *msg, std::unordered_set *deadnodes_set,
+ Meta *nodes, Meta *recovery_nodes);
+
+ const char *heartbeat_timeout_val =
+ Environment::Get()->find("PS_HEARTBEAT_TIMEOUT");
+ int heartbeat_timeout_ =
+ heartbeat_timeout_val ? atoi(heartbeat_timeout_val) : 0;
+
+ DISALLOW_COPY_AND_ASSIGN(Van);
+};
+} // namespace ps
+#endif // PS_INTERNAL_VAN_H_
diff --git a/ps-lite/include/ps/kvapp.h b/ps-lite/include/ps/kvapp.h
new file mode 100644
index 0000000..eb87b5b
--- /dev/null
+++ b/ps-lite/include/ps/kvapp.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "ps/internal/postoffice.h"
+#include "ps/internal/customer.h"
+#include "ps/internal/message.h"
+
+namespace ps {
+
+// Recursively register receive message handler (from 0 to kNumPSfunction)
+template
+struct KVAppRegisterHelper {
+ static void init(app *ptr) {
+ ptr->message_handlers[ftype] = std::bind(
+ &app::template onReceive, ptr, std::placeholders::_1);
+ KVAppRegisterHelper::init(ptr);
+ }
+};
+
+template
+struct KVAppRegisterHelper {
+ static void init(app *ptr) {
+ }
+};
+
+class KVApp {
+public:
+ explicit KVApp(int app_id) {
+ obj_.reset(new Customer(
+ app_id, app_id,
+ std::bind(&KVApp::Process, this, std::placeholders::_1)));
+ }
+ std::unique_ptr