diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1fccce5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "third_party/GraphMix"] + path = third_party/GraphMix + url = https://github.com/nox-410/GraphMix.git +[submodule "third_party/HetuML"] + path = third_party/HetuML + url = https://github.com/ccchengff/HetuML.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..fa225f1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,59 @@ +cmake_minimum_required(VERSION 3.18) + +project(Hetu CXX) + +include(cmake/config.cmake) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_CXX_FLAGS "-O3 -Wall") + +# openmp +find_package(OpenMP REQUIRED) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + +# compile flag +if(${HETU_VERSION} STREQUAL "all") + set(HETU_COMPILE_GPU ON) + set(HETU_COMPILE_MKL ON) +elseif(${HETU_VERSION} STREQUAL "gpu") + set(HETU_COMPILE_GPU ON) + set(HETU_COMPILE_MKL OFF) +elseif(${HETU_VERSION} STREQUAL "mkl") + set(HETU_COMPILE_GPU OFF) + set(HETU_COMPILE_MKL ON) +else() + message(FATAL_ERROR "unknown hetu version") +endif() +message(STATUS "HETU version: ${HETU_VERSION}") + +# cuda +if(${HETU_COMPILE_GPU}) + set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc) + file(READ ${CUDAToolkit_ROOT}/version.txt RAW_CUDA_VERSION) + string(REGEX MATCH "[0-9\.]+" CUDA_VERSION ${RAW_CUDA_VERSION}) + if(${CUDA_VERSION} VERSION_LESS "10.1") + message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDA_VERSION}") + endif() + find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) +endif() + +include(FetchContent) # download third_party + +add_subdirectory(${CMAKE_SOURCE_DIR}/src) + +if(${HETU_PS}) + add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite) +endif() + +if(${HETU_GEOMETRIC}) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix) +endif() +if (HETU_ML) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML) +endif() +enable_testing() diff --git a/COMMITTERS.md b/COMMITTERS.md new file mode 100644 index 0000000..3406485 --- /dev/null +++ b/COMMITTERS.md @@ -0,0 +1,33 @@ +## Committer + +Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer. + +One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers. + +Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md). 
+
+## Committer Members
+### Current Committers
+- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](swordonline@foxmail.com)>
+- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](ccchengff@gmail.com)>
+- [codecaution](https://github.com/codecaution) - **Xiaonan Nie**
+- [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang**
+- [nox-410](https://github.com/nox-410) - **Yining Shi**
+- [initzhang](https://github.com/initzhang) - **Xin Zhang**
+- [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang**
+- [AFDWang](https://github.com/AFDWang) - **Yujie Wang**
+- [sj1104](https://github.com/sj1104) - **Jia Shen**
+- [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou**
+- [zmxdream](https://github.com/zmxdream) - **Minxu Zhang**
+
+We would like to sincerely thank the following community members for their contributions to Hetu.
+
+- [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)**
+- [lbluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
+- [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)**
+- [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)**
+- [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)**
+- [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao**
+- [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue**
+- **Lizi Su**
+- **Dong Li**
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..239c267
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,53 @@
+# Contributing to Hetu
+You are welcome to [report issues](https://github.com/PKU-DAIR/Hetu/issues) or open [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It is recommended to read the following contributing guide before contributing.
+
+
+## Issues
+We use GitHub Issues to track public bugs and feature requests.
+
+### Search Known Issues First
+Please search the existing issues to see if a similar issue or feature request has already been filed, and make sure your issue is not a duplicate.
+
+### Reporting New Issues
+When opening an issue, the more information the better: a detailed description, screenshots or videos of the problem, and logs or code blocks that reproduce the crash.
+
+## Pull Requests
+We warmly welcome pull requests that make Hetu better.
+
+### Branch Management
+There are two kinds of branches here:
+
+1. `main` branch.
+
+   (1). It is the latest (pre-)release branch. We use `main` for tags, with version numbers `1.0.0`, `1.1.0`, `1.2.0`...
+
+   (2). **Don't submit any PR on the `main` branch.**
+
+2. `specific version` branches.
+
+   (1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable development branch. After full testing, a `specific version` branch will be merged into `main` for the next release.
+
+   (2). **You are recommended to submit bugfix or feature PRs on a `specific version` branch.**
+
+
+Normal bugfixes or feature requests should be submitted to a `specific version` branch. After full testing, we will merge them into `main` for the next release.
+
+
+### Make Pull Requests
+The code team monitors all pull requests and runs code checks and tests on them. After all tests pass, we will accept the PR, but it will not be merged into `main` immediately; there may be some delay.
+
+Before submitting a pull request, please make sure the following are done:
+
+1. Fork the repo and create your branch from `main` or a `specific version` branch.
+2. 
Update code or documentation if you have changed APIs. +3. Add the copyright notice to the top of any new files you've added. +4. Check your code lints and checkstyles. +5. Test and test again your code. +6. Now, you can submit your pull request on `specific version` branch. + +## Code Style Guide +Use [Code Style](./.clang-format) for Python and C++. + +## License +By contributing to Hetu, you agree that your contributions will be licensed +under [License](LICENSE) \ No newline at end of file diff --git a/LICENSE b/LICENSE index d63dfb7..7849ac5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,124 +1,201 @@ -木兰宽松许可证, 第2版 - -2020年1月 http://license.coscl.org.cn/MulanPSL2 - -您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: - -0. 定义 - -“软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 - -“贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 - -“贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 - -“法人实体” 是指提交贡献的机构及其“关联实体”。 - -“关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 - -1. 授予版权许可 - -每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 - -2. 授予专利许可 - -每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 - -3. 无商标许可 - -“本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 - -4. 分发限制 - -您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 - -5. 免责声明与责任限制 - -“软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 - -6. 语言 - -“本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 - -条款结束 - -如何将木兰宽松许可证,第2版,应用到您的软件 - -如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: - -1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; - -2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; - -3, 请将如下声明文本放入每个源文件的头部注释中。 - -Copyright (c) [Year] [name of copyright holder] -[Software Name] is licensed under Mulan PSL v2. -You can use this software according to the terms and conditions of the Mulan PSL v2. -You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -See the Mulan PSL v2 for more details. -Mulan Permissive Software License,Version 2 -Mulan Permissive Software License,Version 2 (Mulan PSL v2) - -January 2020 http://license.coscl.org.cn/MulanPSL2 - -Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: - -0. Definition - -Software means the program and related documents which are licensed under this License and comprise all Contribution(s). - -Contribution means the copyrightable work licensed by a particular Contributor under this License. - -Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. - -Legal Entity means the entity making a Contribution and all its Affiliates. - -Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. 
- -1. Grant of Copyright License - -Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. - -2. Grant of Patent License - -Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. - -3. No Trademark License - -No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4. - -4. Distribution Restriction - -You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. - -5. Disclaimer of Warranty and Limitation of Liability - -THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -6. Language - -THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. - -END OF THE TERMS AND CONDITIONS - -How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software - -To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: - -Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; -Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package; -Attach the statement to the appropriate annotated syntax at the beginning of each source file. 
-Copyright (c) [Year] [name of copyright holder] -[Software Name] is licensed under Mulan PSL v2. -You can use this software according to the terms and conditions of the Mulan PSL v2. -You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -See the Mulan PSL v2 for more details. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] [Peking University] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 6ec32e1..1accd40 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,177 @@ -#### 从命令行创建一个新的仓库 +
+ +
+# HETU
+
+
+
+[Documentation](https://hetu-doc.readthedocs.io) |
+[Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html)
+
+Hetu is a high-performance distributed deep learning system targeting the training of trillion-parameter DL models, developed by the DAIR Lab at Peking University. It takes into account both high availability in industry and innovation in academia, and has a number of advanced characteristics:
+
+- Applicability. DL model definition with a standard dataflow graph; many basic CPU and GPU operators; efficient implementations of plenty of DL models and at least 10 popular ML algorithms.
+
+- Efficiency. Achieves at least a 30% speedup over TensorFlow on DNN, CNN and RNN benchmarks.
+
+- Flexibility. Supports various parallel training protocols and distributed communication architectures, such as data/model/pipeline parallelism and Parameter Server & AllReduce.
+
+- Scalability. Deployment on more than 100 computation nodes; training giant models with trillions of model parameters, e.g., on Criteo Kaggle and Open Graph Benchmark.
+
+- Agility. Automated ML pipeline: feature engineering, model selection, hyperparameter search.
+
+We welcome everyone interested in machine learning or graph computing to contribute code, create issues or open pull requests. Please refer to the [Contribution Guide](CONTRIBUTING.md) for more details.
+
+## Installation
+1. Clone the repository.
+
+2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used: ```conda env create -f environment.yml```. Please prepare the CUDA toolkit and cuDNN in advance.
+
+3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
 ```bash
-touch README.md
-git init
-git add README.md
-git commit -m "first commit"
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+# modify paths and configurations in cmake/config.cmake
+
+# generate Makefile
+mkdir build && cd build && cmake ..
+# compile
+# make all
+make -j 8
+# make hetu, version is specified in cmake/config.cmake
+make hetu -j 8
+# make allreduce module
+make allreduce -j 8
+# make ps module
+make ps -j 8
+# make geometric module
+make geometric -j 8
+# make hetu-cache module
+make hetu_cache -j 8
 ```
-#### 从命令行推送已经创建的仓库
+
+4. Prepare the environment for running. Edit the hetu.exp file and set the environment path for Python and the path to the mpirun executable if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp`.
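+
+   As a rough illustration, `hetu.exp` is a shell snippet that exports the required paths before it is sourced. A minimal sketch is shown below; the variable values are placeholders for illustration, not the exact file shipped with the repository:
+
+   ```bash
+   # hypothetical sketch of hetu.exp: expose the Hetu Python package and,
+   # if mpirun lives outside the conda environment, its bin directory
+   export PYTHONPATH=/path/to/Hetu/python:$PYTHONPATH
+   export PATH=/path/to/openmpi/bin:$PATH
+   ```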
+
+
+
+## Usage
+
+Train logistic regression on GPU:
 ```bash
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST
 ```
+
+Train a 3-layer MLP on GPU:
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer CNN on GPU:
+
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST
+```
+
+Train a 3-layer MLP with AllReduce on 8 GPUs (using mpirun):
+```bash
+bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer MLP with PS on 1 server and 2 workers:
+```bash
+# in the script we launch the scheduler, the server, and two workers
+bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10
+```
+
+
+## More Examples
+Please refer to the examples directory, which contains CNN, NLP, CTR and GNN training scripts. For distributed training, please refer to the CTR and GNN tasks.
+
+## Community
+* Email: xupeng.miao@pku.edu.cn
+* Slack: coming soon
+* Hetu homepage: https://hetu-doc.readthedocs.io
+* [Committers & Contributors](COMMITTERS.md)
+* [Contributing to Hetu](CONTRIBUTING.md)
+* [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html)
+
+## Enterprise Users
+
+If you are an enterprise user and find Hetu useful in your work, please let us know, and we will be glad to add your company logo here.
+
+
+
+
+
+## License
+
+The entire codebase is under the [license](LICENSE).
+
+## Papers
+ 1. Xupeng Miao, Lingxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021.
+ 2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773). SIGMOD 2021.
+ 3. coming soon
+
+## Acknowledgements
+
+We learned and borrowed insights from a few open-source projects, including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel).
+
+## Appendix
+The prerequisites for the different modules in Hetu are listed as follows:
+ ```
+ "*" means you should prepare it yourself, while the others support auto-download
+
+ Hetu: OpenMP(*), CMake(*)
+ Hetu (version mkl): MKL 1.6.1
+ Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*)
+ Hetu (version all): both
+
+ Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs the GPU version
+
+ Hetu-PS: Protobuf(*), ZeroMQ 4.3.2
+
+ Hetu-Geometric: Pybind11(*), Metis(*)
+
+ Hetu-Cache: Pybind11(*), this module needs the PS module
+
+ ##################################################################
+ Tips for preparing the prerequisites
+
+ Preparing CUDA, CUDNN, NCCL (NCCL is already in the conda environment):
+ 1. download from https://developer.nvidia.com
+ 2. install
+ 3. modify paths in cmake/config.cmake if necessary
+
+ Preparing OpenMP:
+ You just need to ensure that your compiler supports OpenMP.
+
+ Preparing CMake, Protobuf, Pybind11, Metis:
+ Install by anaconda:
+ conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis
+
+ Preparing OpenMPI (not necessary):
+ install by anaconda: `conda install -c conda-forge openmpi=4.0.3`
+ or
+ 1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
+ 2. 
build openmpi by `./configure /path/to/build && make -j8 && make install` + 3. modify MPI_HOME to /path/to/build in cmake/config.cmake + + Preparing MKL (not necessary): + install by anaconda: `conda install -c conda-forge onednn` + or + 1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz + 2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8` + 3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake + + Preparing ZeroMQ (not necessary): + install by anaconda: `conda install -c anaconda zeromq=4.3.2` + or + 1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip + 2. build zeromq by 'mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8` + 3. modify ZMQ_ROOT to /path/to/build in cmake/config.cmake + ``` diff --git a/bin/heturun b/bin/heturun new file mode 100755 index 0000000..5bf3693 --- /dev/null +++ b/bin/heturun @@ -0,0 +1,2 @@ +#!/bin/bash +python $(cd $(dirname $0); pwd)/../python/runner.py $@ diff --git a/cmake/Modules/FindCUDNN.cmake b/cmake/Modules/FindCUDNN.cmake new file mode 100644 index 0000000..5cfbb0d --- /dev/null +++ b/cmake/Modules/FindCUDNN.cmake @@ -0,0 +1,75 @@ +# Find the CUDNN libraries +# +# The following variables are optionally searched for defaults +# CUDNN_ROOT: Base directory where CUDNN is found +# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for +# CUDNN_LIBRARY: Directory where CUDNN library is searched for +# CUDNN_STATIC: Are we looking for a static library? (default: no) +# +# The following are set after configuration is done: +# CUDNN_FOUND +# CUDNN_INCLUDE_PATH +# CUDNN_LIBRARY_PATH +# + +set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN") +if (DEFINED $ENV{CUDNN_ROOT_DIR}) + message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.") +endif() +list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) + +# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT}) + +set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files") + +find_path(CUDNN_INCLUDE_PATH cudnn.h + HINTS ${CUDNN_INCLUDE_DIR} + PATH_SUFFIXES cuda/include cuda include + REQUIRED) + +option(CUDNN_STATIC "Look for static CUDNN" OFF) +if (CUDNN_STATIC) + set(CUDNN_LIBNAME "libcudnn_static.a") +else() + set(CUDNN_LIBNAME "cudnn") +endif() + +set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)") +if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC) + message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.") +endif() + +find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME} + PATHS ${CUDNN_LIBRARY} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 + REQUIRED) + +set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc") +file(WRITE ${file} " + #include + #include \"${CUDNN_INCLUDE_PATH}/cudnn.h\" + int main() + { + std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' 
<< CUDNN_PATCHLEVEL; + int x = cudnnGetVersion(); + return x == CUDNN_VERSION; + } +") +try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE CUDNN_VERSION + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" + LINK_LIBRARIES ${CUDNN_LIBRARY_PATH}) +if (NOT CUDNN_VERSION_MATCHED) + message(FATAL_ERROR "Found CUDNN header version and library version do not match! \ + (include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.") +endif() +message(STATUS "CUDNN version: ${CUDNN_VERSION}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + CUDNN + REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH + VERSION_VAR CUDNN_VERSION) + +mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY) diff --git a/cmake/Modules/FindMETIS.cmake b/cmake/Modules/FindMETIS.cmake new file mode 100644 index 0000000..eaf818b --- /dev/null +++ b/cmake/Modules/FindMETIS.cmake @@ -0,0 +1,70 @@ +# Accepts the following variables: +# +# METIS_ROOT: Prefix where METIS is installed. +# METIS_LIB_NAME: Name of the METIS library (default: metis). +# METIS_LIBRARY: Full path of the METIS library. + +# Sets the following variables: +# +# METIS_LIBRARY: Full path of the METIS library. +# METIS_FOUND: True if ParMETIS was found. +# METIS_LIBRARIES: List of all libraries needed for linking with METIS, +# +# Provides the following macros: +# +# find_package(METIS) +# +# Searches for METIS (See above) + + +# search metis header +find_path(METIS_INCLUDE_DIR metis.h + PATHS ${METIS_DIR} ${METIS_ROOT} + PATH_SUFFIXES metis include include/metis Lib METISLib + NO_DEFAULT_PATH + DOC "Include directory of metis") +find_path(METIS_INCLUDE_DIR metis.h + PATH_SUFFIXES metis include include/metis Lib METISLib) + +set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library") + +# search metis library +if(NOT METIS_LIB_NAME) + set(METIS_LIB_NAME metis) +endif(NOT METIS_LIB_NAME) + +find_library(METIS_LIBRARY ${METIS_LIB_NAME} + PATHS ${METIS_DIR} ${METIS_ROOT} + PATH_SUFFIXES lib + NO_DEFAULT_PATH) +find_library(METIS_LIBRARY ${METIS_LIB_NAME} + PATH_SUFFIXES lib +) + +# behave like a CMake module is supposed to behave +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + "METIS" + DEFAULT_MSG + METIS_INCLUDE_DIR + METIS_LIBRARY +) + +mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME) + +# if both headers and library are found, store results +if(METIS_FOUND) + set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR}) + set(METIS_LIBRARIES ${METIS_LIBRARY}) + # log result + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log + "Determing location of METIS succeded:\n" + "Include directory: ${METIS_INCLUDE_DIRS}\n" + "Library directory: ${METIS_LIBRARIES}\n\n") +else(METIS_FOUND) + # log errornous result + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + "Determing location of METIS failed:\n" + "Include directory: ${METIS_INCLUDE_DIRS}\n" + "Library directory: ${METIS_LIBRARIES}\n\n") +endif(METIS_FOUND) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000..322a80a --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,14 @@ +# - Try to find DNNL(MKL-DNN) +# Once done this will define +# DNNL_FOUND - System has DNNL +# DNNL_INCLUDE_DIR - The DNNL include directories +# DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build +# DNNL_LIBRARY - The libraries needed to 
use DNNL +# DNNL_DEFINITIONS - Compiler switches required for using DNNL + +find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include ) +find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include ) +find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src ) + +include ( FindPackageHandleStandardArgs ) +find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR ) diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 0000000..b0a81d9 --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,97 @@ +# Try to find NCCL +# +# The following variables are optionally searched for defaults +# NCCL_ROOT: Base directory where all NCCL components are found +# NCCL_ROOT_DIR: Base directory where all NCCL components are found +# NCCL_INCLUDE_DIR: Directory where NCCL header is found +# NCCL_LIB_DIR: Directory where NCCL library is found +# +# The following are set after configuration is done: +# NCCL_FOUND +# NCCL_INCLUDE_DIRS +# NCCL_LIBRARIES +# +# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# install NCCL in the same location as the CUDA toolkit. +# See https://github.com/caffe2/caffe2/issues/1601 + +if (NOT DEFINED NCCL_ROOT) + set(NCCL_ROOT $ENV{CONDA_PREFIX}) +endif() + +set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL") + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS + ${NCCL_ROOT} + ${NCCL_ROOT}/include + ${NCCL_INCLUDE_DIR} + ${NCCL_ROOT_DIR} + ${NCCL_ROOT_DIR}/include + ${CUDA_TOOLKIT_ROOT_DIR}/include + REQUIRED) + +if ($ENV{USE_STATIC_NCCL}) + message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library") + set(NCCL_LIBNAME "libnccl_static.a") +else() + set(NCCL_LIBNAME "nccl") +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS + ${NCCL_LIB_DIR} + ${NCCL_ROOT} + ${NCCL_ROOT}/lib + ${NCCL_ROOT}/lib/x86_64-linux-gnu + ${NCCL_ROOT}/lib64 + ${NCCL_ROOT_DIR} + ${NCCL_ROOT_DIR}/lib + ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu + ${NCCL_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + REQUIRED) + +set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") +message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") +set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) +list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR}) +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + +if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include \"${NCCL_HEADER_FILE}\" + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } + ") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! \ + (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). 
Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION}") +else() + message(STATUS "NCCL version < 2.3.5-5") +endif () +set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + +mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + NCCL + REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES + VERSION_VAR NCCL_VERSION) diff --git a/cmake/Modules/FindZMQ.cmake b/cmake/Modules/FindZMQ.cmake new file mode 100644 index 0000000..806c9c6 --- /dev/null +++ b/cmake/Modules/FindZMQ.cmake @@ -0,0 +1,47 @@ +# - Try to find ZMQ +# Once done this will define +# ZMQ_FOUND - System has ZMQ +# ZMQ_INCLUDE_DIRS - The ZMQ include directories +# ZMQ_LIBRARIES - The libraries needed to use ZMQ +# ZMQ_DEFINITIONS - Compiler switches required for using ZMQ + +find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include ) +find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib ) + +set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} ) +set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} ) + +if (DEFINED ZMQ_LIBRARIES AND DEFINED ZMQ_INCLUDE_DIRS) + set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc") + file(WRITE ${file} " + #include + #include \"${ZMQ_INCLUDE_DIRS}/zmq.h\" + int main() + { + std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH; + int x, y, z; + zmq_version(&x, &y, &z); + return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH; + } + ") + try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE ZMQ_VERSION + LINK_LIBRARIES ${ZMQ_LIBRARIES}) + if (NOT ZMQ_VERSION_MATCHED) + message(WARNING "Found ZMQ header version and library version do not match! \ + (include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). 
Please set ZMQ_ROOT and ZMQ_BUILD carefully.") + unset(ZMQ_INCLUDE_DIRS) + unset(ZMQ_LIBRARIES) + unset(ZMQ_VERSION) + else () + message(STATUS "ZMQ version: ${ZMQ_VERSION}") + endif() +endif() + +include ( FindPackageHandleStandardArgs ) +# handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args ( + ZMQ + REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS + VERSION_VAR ZMQ_VERSION) diff --git a/cmake/config.example.cmake b/cmake/config.example.cmake new file mode 100644 index 0000000..86224f1 --- /dev/null +++ b/cmake/config.example.cmake @@ -0,0 +1,55 @@ +###################### +### Set targets ###### +###################### + +# hetu main version, choose from (mkl, gpu, all) +# if using mkl (for CPU) or all, OpenMP(*), mkl required +# if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required +set(HETU_VERSION "all") + +# whether to compile allreduce module +# nccl(*), openmpi required +set(HETU_ALLREDUCE ON) + +# whether to compile ps module +# protobuf(*), zeromq required +set(HETU_PS ON) + +# whether to compile geometric module (for GNNs) +# pybind11(*), metis(*) required +set(HETU_GEOMETRIC ON) + +# whether to compile cache module (for PS) +# to enable this, you must turn HETU_PS on +# pybind11(*) required +set(HETU_CACHE ON) + +# whether to compile Hetu ML Module +set(HETU_ML ON) +set(HETU_PARALLEL_ML ON) + +###################### +### Set paths ######## +###################### + +# CUDA version >= 10.1 +set(CUDAToolkit_ROOT /usr/local/cuda) + +# NCCL version >= 2.8 +set(NCCL_ROOT $ENV{CONDA_PREFIX}) + +set(CUDNN_ROOT) + +# MPI version >= 3.1 (OpenMPI version >= 4.0.3) +# if valid version not found, we'll download and compile it in time (openmpi-4.0.3) +set(MPI_HOME $ENV{CONDA_PREFIX}) + +# MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl +# if not found, we'll download and compile it in time +set(MKL_ROOT $ENV{CONDA_PREFIX}) +set(MKL_BUILD $ENV{CONDA_PREFIX}) + +# ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq +# if not found, we'll download and compile it in time +set(ZMQ_ROOT $ENV{CONDA_PREFIX}) +set(ZMQ_BUILD $ENV{CONDA_PREFIX}) diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a230326 --- /dev/null +++ b/environment.yml @@ -0,0 +1,84 @@ +name: hetu +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=4.5=1_gnu + - bcrypt=3.2.0=py37h5e8e339_1 + - blas=1.0=mkl + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2021.7.5=h06a4308_1 + - certifi=2021.5.30=py37h06a4308_0 + - cffi=1.14.6=py37hc58025e_0 + - cmake=3.18.2=ha30ef3c_0 + - cryptography=3.4.7=py37h5d9358c_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - expat=2.4.1=h2531618_2 + - intel-openmp=2021.3.0=h06a4308_3350 + - joblib=1.0.1=pyhd3eb1b0_0 + - krb5=1.18.2=h173b8e3_0 + - ld_impl_linux-64=2.35.1=h7274673_9 + - libcurl=7.71.1=h20c2e04_1 + - libedit=3.1.20210216=h27cfd23_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.3.0=h5101ec6_17 + - libgfortran-ng=7.5.0=h14aa051_19 + - libgfortran4=7.5.0=h14aa051_19 + - libgomp=9.3.0=h5101ec6_17 + - libprotobuf=3.15.8=h780b84a_0 + - libsodium=1.0.18=h7b6447c_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.3.0=hd4cf53a_17 + - libuv=1.40.0=h7b6447c_0 + - lz4-c=1.9.3=h2531618_0 + - metis=5.1.0=hf484d3e_4 + - mkl=2021.3.0=h06a4308_520 + - mkl-service=2.4.0=py37h7f8727e_0 + - mkl_fft=1.3.0=py37h42c9631_2 + - mkl_random=1.2.2=py37h51133e4_0 + - mpi=1.0=openmpi + - 
nccl=2.8.3.1=hcaf9a05_0 + - ncurses=6.2=he6710b0_1 + - numpy=1.20.3=py37hf144106_0 + - numpy-base=1.20.3=py37h74d4b33_0 + - onednn=2.3=omp_hf4ef041_0 + - onnx=1.9.0=py37h284874a_0 + - onnxruntime=1.7.2=py37he8cb6d3_1 + - openmpi=4.0.3=hdf1f1ad_1 + - openssl=1.1.1k=h27cfd23_0 + - pandas=1.2.5=py37h295c915_0 + - paramiko=2.7.2=pyh9f0ad1d_0 + - pip=21.1.3=py37h06a4308_0 + - protobuf=3.15.8=py37hcd2ae1e_0 + - psutil=5.8.0=py37h5e8e339_1 + - pybind11=2.6.2=py37hff7bd54_1 + - pycparser=2.20=pyh9f0ad1d_2 + - pynacl=1.4.0=py37h5e8e339_2 + - python=3.7.10=h12debd9_4 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.7=2_cp37m + - pytz=2021.1=pyhd3eb1b0_0 + - pyyaml=5.4.1=py37h27cfd23_1 + - re2=2021.04.01=h9c3ff4c_0 + - readline=8.1=h27cfd23_0 + - rhash=1.4.1=h3c74f83_1 + - scikit-learn=0.24.2=py37ha9443f7_0 + - scipy=1.6.2=py37had2a1c9_1 + - setuptools=52.0.0=py37h06a4308_0 + - six=1.16.0=pyhd3eb1b0_0 + - sqlite=3.36.0=hc218d9a_0 + - threadpoolctl=2.2.0=pyhb85f177_0 + - tk=8.6.10=hbc83047_0 + - tqdm=4.61.2=pyhd3eb1b0_1 + - typing-extensions=3.10.0.0=hd8ed1ab_0 + - typing_extensions=3.10.0.0=pyha770c72_0 + - wheel=0.36.2=pyhd3eb1b0_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zeromq=4.3.2=he6710b0_3 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.9=haebb681_0 + - pip: + - cloudpickle==1.6.0 + - wget==3.2 diff --git a/examples/cnn/README.md b/examples/cnn/README.md new file mode 100644 index 0000000..474b927 --- /dev/null +++ b/examples/cnn/README.md @@ -0,0 +1,49 @@ +# CNN Examples +In this directory we provide simple implementations for CNN models, including both hetu and tensorflow versions for comparison. +## Structure +``` +- cnn + - models/ CNN models in HETU + - pytorch_models/ CNN models in PyTorch + - tf_models/ CNN models in TensorFlow + - scripts/ Test scripts + - main.py Trainer for HETU + - run_tf_horovod.py Trainer for Horovod + - tf_launch_server.py Trainer for TF-PS (role: server) + - tf_launch_worker.py Trainer for TF-PS (role: worker) + - tf_main.py Trainer for TensorFlow + - torch_main.py Trainer for Pytorch + - +``` +## Usage +Here are some examples of running scripts. +```bash +bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu +bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node) +bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (2-nodes) +``` +To train in PS setting, we also need to launch scheduler and server first. For more information about distributed training, please refer to CTR or GNN examples. + +We can change the setting in scripts. See `mnist_mlp.sh` below. +```bash +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + +### validate and timing +python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing + +### run in cpu +# python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing + +``` + +For more details about training setting, please refer to `main.py`. +## Models +We provide following models with specific datasets. 
+``` +CIFAR100: VGG, ResNet +CIFAR10: MLP, VGG, ResNet +MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN +``` diff --git a/examples/cnn/local_s1.yml b/examples/cnn/local_s1.yml new file mode 100644 index 0000000..20ffc94 --- /dev/null +++ b/examples/cnn/local_s1.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13030 + DMLC_NUM_WORKER : 2 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 0 + server : 1 + scheduler : true diff --git a/examples/cnn/main.py b/examples/cnn/main.py new file mode 100644 index 0000000..1a4a224 --- /dev/null +++ b/examples/cnn/main.py @@ -0,0 +1,202 @@ +import hetu as ht +import models +import os +import numpy as np +import argparse +import json +import logging +from time import time +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if device_id == 0: + logger.info(msg) + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=10, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument('--comm-mode', default=None, help='communication mode') + args = parser.parse_args() + + global device_id + device_id = 0 + print_rank0("Training {} on HETU".format(args.model)) + if args.comm_mode in ('AllReduce', 'Hybrid'): + comm, device_id = ht.mpi_nccl_init() + executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) + else: + if args.gpu == -1: + executor_ctx = ht.cpu(0) + print_rank0('Use CPU.') + else: + executor_ctx = ht.gpu(args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + if args.comm_mode in ('PS', 'Hybrid'): + settings_file = open(os.path.join(os.path.abspath( + os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu)) + settings = json.load(settings_file) + for key in settings: + if type(settings[key]) == str: + os.environ[key] = settings[key] + else: + os.environ[key] = str(settings[key]) # type is str + + assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \ + 'Model not supported!' + model = eval('models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
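+    # The assertions above restrict --model, --dataset and --opt to the supported
+    # values; the optimizer selected via --opt is constructed below, and every
+    # variant uses the learning rate passed through --learning-rate.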
+ + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer( + learning_rate=args.learning_rate, nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = ht.optim.AdaGradOptimizer( + learning_rate=args.learning_rate, initial_accumulator_value=0.1) + else: + print_rank0('Use Adam Optimizer.') + opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate) + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + # train_set_x: (50000, 784), train_set_y: (50000, 10) + # valid_set_x: (10000, 784), valid_set_y: (10000, 10) + # x_shape = (args.batch_size, 784) + # y_shape = (args.batch_size, 10) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=10) + if args.model == "mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10) + # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10) + # x_shape = (args.batch_size, 3, 32, 32) + # y_shape = (args.batch_size, 10) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=100) + # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100) + # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100) + else: + raise NotImplementedError + + # model definition + print_rank0('Building model {}'.format(args.model)) + x = ht.dataloader_op([ + ht.Dataloader(train_set_x, args.batch_size, 'train'), + ht.Dataloader(valid_set_x, args.batch_size, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(train_set_y, args.batch_size, 'train'), + ht.Dataloader(valid_set_y, args.batch_size, 'validate'), + ]) + if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': + loss, y = model(x, y_, 100) + else: + loss, y = model(x, y_) + + train_op = opt.minimize(loss) + + eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]} + executor = ht.Executor(eval_nodes, ctx=executor_ctx, + comm_mode=args.comm_mode) + n_train_batches = executor.get_batch_num('train') + n_valid_batches = executor.get_batch_num('validate') + + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + loss_val, predict_y, y_val, _ = executor.run( + 'train', eval_node_list=[loss, y, y_, train_op]) + # Loss for this minibatch + predict_y = predict_y.asnumpy() + y_val = y_val.asnumpy() + loss_all += loss_val.asnumpy() + batch_num += 1 + # Predict accuracy for this minibatch + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % 
loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + during_time = end - start + print_rank0("Running time of current epoch = %fs" % (during_time)) + if i != 0: + running_time += during_time + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + loss_val, valid_y_predicted, y_val = executor.run( + 'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True) + val_loss_all += loss_val + batch_num += 1 + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) + if args.comm_mode in ('AllReduce', 'Hybrid'): + ht.mpi_nccl_finish(comm) diff --git a/examples/cnn/models/AlexNet.py b/examples/cnn/models/AlexNet.py new file mode 100644 index 0000000..1491dbf --- /dev/null +++ b/examples/cnn/models/AlexNet.py @@ -0,0 +1,61 @@ +import hetu as ht +from hetu import init + + +def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False): + weight = init.random_normal( + shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') + bn_scale = init.random_normal( + shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale') + bn_bias = init.random_normal( + shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias') + x = ht.conv2d_op(x, weight, stride=1, padding=1) + x = ht.batch_normalization_op(x, bn_scale, bn_bias) + if with_relu: + x = ht.relu_op(x) + if with_pool: + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0) + return x + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def alexnet(x, y_): + ''' + AlexNet model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building AlexNet model...') + x = ht.array_reshape_op(x, [-1, 1, 28, 28]) + x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1', + with_relu=True, with_pool=True) + x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2', + with_relu=True, with_pool=True) + x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3', + with_relu=True, with_pool=False) + x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4', + with_relu=True, with_pool=False) + x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5', + with_relu=False, with_pool=True) + x = ht.array_reshape_op(x, (-1, 256*3*3)) + x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True) + x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True) + y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/CNN.py b/examples/cnn/models/CNN.py new file mode 100644 index 0000000..ed84059 --- /dev/null +++ b/examples/cnn/models/CNN.py @@ -0,0 +1,41 @@ +import hetu as ht +from hetu import init + + +def conv_relu_avg(x, shape): + weight = init.random_normal(shape=shape, stddev=0.1) + x = ht.conv2d_op(x, weight, padding=2, stride=1) + x = ht.relu_op(x) + x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def fc(x, shape): + weight = init.random_normal(shape=shape, stddev=0.1) + bias = init.random_normal(shape=shape[-1:], stddev=0.1) + x = ht.array_reshape_op(x, (-1, shape[0])) + x = ht.matmul_op(x, weight) + y = x + ht.broadcastto_op(bias, x) + return y + + +def cnn_3_layers(x, y_): + ''' + 3-layer-CNN model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building 3-layer-CNN model...') + x = ht.array_reshape_op(x, [-1, 1, 28, 28]) + x = conv_relu_avg(x, (32, 1, 5, 5)) + x = conv_relu_avg(x, (64, 32, 5, 5)) + y = fc(x, (7 * 7 * 64, 10)) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LSTM.py b/examples/cnn/models/LSTM.py new file mode 100644 index 0000000..7469a7c --- /dev/null +++ b/examples/cnn/models/LSTM.py @@ -0,0 +1,90 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def lstm(x, y_): + ''' + LSTM model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + forget_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w") + forget_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u") + forget_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b") + input_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w") + input_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u") + input_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b") + output_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w") + output_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u") + output_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b") + tanh_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w") + tanh_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u") + tanh_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b") + out_weights = init.random_normal( + shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight") + out_bias = init.random_normal( + shape=(dimoutput,), stddev=0.1, name="lstm_out_bias") + initial_state = ht.Variable(value=np.zeros((1,)).astype( + np.float32), name='initial_state', trainable=False) + + for i in range(nsteps): + cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput)) + # forget gate + if i == 0: + temp = ht.matmul_op(cur_x, forget_gate_w) + last_c_state = ht.broadcastto_op(initial_state, temp) + last_h_state = ht.broadcastto_op(initial_state, temp) + cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp + else: + cur_forget = ht.matmul_op( + last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w) + cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget) + cur_forget = ht.sigmoid_op(cur_forget) + # input gate + cur_input = ht.matmul_op( + last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w) + cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input) + cur_input = ht.sigmoid_op(cur_input) + # output gate + cur_output = ht.matmul_op( + last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w) + cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output) + cur_output = ht.sigmoid_op(cur_output) + # tanh + cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \ + ht.matmul_op(cur_x, tanh_w) + cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh) + cur_tanh = ht.tanh_op(cur_tanh) + + last_c_state = ht.mul_op(last_c_state, cur_forget) + \ + ht.mul_op(cur_input, cur_tanh) + last_h_state = ht.tanh_op(last_c_state) * cur_output + + x = ht.matmul_op(last_h_state, out_weights) + y = x + ht.broadcastto_op(out_bias, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LeNet.py b/examples/cnn/models/LeNet.py new file mode 100644 index 0000000..a14903c --- /dev/null +++ b/examples/cnn/models/LeNet.py @@ -0,0 +1,46 @@ 
+import hetu as ht +from hetu import init + + +def conv_pool(x, in_channel, out_channel, name): + weight = init.random_normal( + shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight') + x = ht.conv2d_op(x, weight, padding=2, stride=1) + x = ht.relu_op(x) + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def lenet(x, y_): + ''' + LeNet model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building LeNet model...') + x = ht.array_reshape_op(x, (-1, 1, 28, 28)) + x = conv_pool(x, 1, 6, name='lenet_conv1') + x = conv_pool(x, 6, 16, name='lenet_conv2') + x = ht.array_reshape_op(x, (-1, 7*7*16)) + x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True) + x = fc(x, (120, 84), name='lenet_fc2', with_relu=True) + y = fc(x, (84, 10), name='lenet_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LogReg.py b/examples/cnn/models/LogReg.py new file mode 100644 index 0000000..a8a05a6 --- /dev/null +++ b/examples/cnn/models/LogReg.py @@ -0,0 +1,24 @@ +import hetu as ht +from hetu import init + + +def logreg(x, y_): + ''' + Logistic Regression model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Build logistic regression model...") + weight = init.zeros((784, 10), name='logreg_weight') + bias = init.zeros((10,), name='logreg_bias') + x = ht.matmul_op(x, weight) + y = x + ht.broadcastto_op(bias, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/MLP.py b/examples/cnn/models/MLP.py new file mode 100644 index 0000000..184e200 --- /dev/null +++ b/examples/cnn/models/MLP.py @@ -0,0 +1,33 @@ +import hetu as ht +from hetu import init + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def mlp(x, y_): + ''' + MLP model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building MLP model...") + x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True) + x = fc(x, (256, 256), 'mlp_fc2', with_relu=True) + y = fc(x, (256, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/RNN.py b/examples/cnn/models/RNN.py new file mode 100644 index 0000000..a767952 --- /dev/null +++ b/examples/cnn/models/RNN.py @@ -0,0 +1,56 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def rnn(x, y_): + ''' + RNN model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building RNN model...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + weight1 = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1') + bias1 = init.random_normal( + shape=(dimhidden, ), stddev=0.1, name='rnn_bias1') + weight2 = init.random_normal( + shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2') + bias2 = init.random_normal( + shape=(dimhidden, ), stddev=0.1, name='rnn_bias2') + weight3 = init.random_normal( + shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3') + bias3 = init.random_normal( + shape=(dimoutput, ), stddev=0.1, name='rnn_bias3') + last_state = ht.Variable(value=np.zeros((1,)).astype( + np.float32), name='initial_state', trainable=False) + + for i in range(nsteps): + cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput)) + h = ht.matmul_op(cur_x, weight1) + h = h + ht.broadcastto_op(bias1, h) + + if i == 0: + last_state = ht.broadcastto_op(last_state, h) + s = ht.concat_op(h, last_state, axis=1) + s = ht.matmul_op(s, weight2) + s = s + ht.broadcastto_op(bias2, s) + last_state = ht.relu_op(s) + + final_state = last_state + x = ht.matmul_op(final_state, weight3) + y = x + ht.broadcastto_op(bias3, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/ResNet.py b/examples/cnn/models/ResNet.py new file mode 100644 index 0000000..cd1d30e --- /dev/null +++ b/examples/cnn/models/ResNet.py @@ -0,0 +1,125 @@ +import hetu as ht +from hetu import init + + +def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''): + weight = init.random_normal( + shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') + x = ht.conv2d_op(x, weight, stride=stride, padding=padding) + return x + + +def batch_norm_with_relu(x, hidden, name): + scale = init.random_normal( + shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale') + bias = init.random_normal(shape=(1, hidden, 1, 1), + stddev=0.1, name=name+'_bias') + x = ht.batch_normalization_op(x, scale, bias) + x = ht.relu_op(x) + return x + + +def resnet_block(x, in_channel, num_blocks, is_first=False, name=''): + if is_first: + out_channel = in_channel + identity = x + x = conv2d(x, in_channel, out_channel, stride=1, + padding=1, name=name+'_conv1') + x = batch_norm_with_relu(x, out_channel, name+'_bn1') + x = conv2d(x, out_channel, out_channel, stride=1, + 
padding=1, name=name+'_conv2') + x = x + identity + else: + out_channel = 2 * in_channel + identity = x + x = batch_norm_with_relu(x, in_channel, name+'_bn0') + x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]]) + x = conv2d(x, in_channel, out_channel, stride=2, + padding=0, name=name+'_conv1') + x = batch_norm_with_relu(x, out_channel, name+'_bn1') + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv2') + identity = ht.avg_pool2d_op( + identity, kernel_H=2, kernel_W=2, padding=0, stride=2) + identity = ht.pad_op( + identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]]) + x = x + identity + + for i in range(1, num_blocks): + identity = x + x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i)) + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv%d' % (2 * i + 1)) + x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1)) + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv%d' % (2 * i + 2)) + x = x + identity + + return x + + +def fc(x, shape, name): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + return x + + +def resnet(x, y_, num_layers=18, num_class=10): + ''' + ResNet model, for CIFAR10 dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + base_size = 16 + + x = conv2d(x, 3, base_size, stride=1, padding=1, + name='resnet_initial_conv') + x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn') + + if num_layers == 18: + print("Building ResNet-18 model...") + x = resnet_block(x, base_size, num_blocks=2, + is_first=True, name='resnet_block1') + x = resnet_block(x, base_size, num_blocks=2, + is_first=False, name='resnet_block2') + x = resnet_block(x, 2 * base_size, num_blocks=2, + is_first=False, name='resnet_block3') + x = resnet_block(x, 4 * base_size, num_blocks=2, + is_first=False, name='resnet_block4') + elif num_layers == 34: + print("Building ResNet-34 model...") + x = resnet_block(x, base_size, num_blocks=3, + is_first=True, name='resnet_block1') + x = resnet_block(x, base_size, num_blocks=4, + is_first=False, name='resnet_block2') + x = resnet_block(x, 2 * base_size, num_blocks=6, + is_first=False, name='resnet_block3') + x = resnet_block(x, 4 * base_size, num_blocks=3, + is_first=False, name='resnet_block4') + else: + assert False, "Number of layers should be 18 or 34 !" 
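+    # Shape note (assuming the 32x32 CIFAR inputs this model is written for): blocks 2-4 each halve the spatial size and double the channel count, so the feature map here is (8 * base_size) channels on a 4x4 grid, i.e. 128 * base_size features once flattened for the final FC layer below.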
+ + x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn') + x = ht.array_reshape_op(x, (-1, 128 * base_size)) + y = fc(x, (128 * base_size, num_class), name='resnet_final_fc') + # here we don't use cudnn for softmax crossentropy to avoid overflows + loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y + + +def resnet18(x, y_, num_class=10): + return resnet(x, y_, 18, num_class) + + +def resnet34(x, y_, num_class=10): + return resnet(x, y_, 34, num_class) diff --git a/examples/cnn/models/VGG.py b/examples/cnn/models/VGG.py new file mode 100644 index 0000000..9cdab5f --- /dev/null +++ b/examples/cnn/models/VGG.py @@ -0,0 +1,100 @@ +import hetu as ht +from hetu import init + + +def conv_bn_relu(x, in_channel, out_channel, name): + weight = init.random_normal(shape=(out_channel, in_channel, 3, 3), + stddev=0.1, name=name+'_weight') + bn_scale = init.random_normal(shape=(1, out_channel, 1, 1), + stddev=0.1, name=name+'_bn_scale') + bn_bias = init.random_normal(shape=(1, out_channel, 1, 1), + stddev=0.1, name=name+'_bn_bias') + + x = ht.conv2d_op(x, weight, padding=1, stride=1) + x = ht.batch_normalization_op(x, bn_scale, bn_bias) + act = ht.relu_op(x) + return act + + +def vgg_2block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_3block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_4block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_fc(x, in_feat, out_feat, name): + weight = init.random_normal(shape=(in_feat, out_feat), + stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=(out_feat,), + stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + return x + + +def vgg(x, y_, num_layers, num_class=10): + ''' + VGG model, for CIFAR10/CIFAR100 dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + num_layers: 16 or 19 + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + if num_layers == 16: + print('Building VGG-16 model...') + x = vgg_2block(x, 3, 64, 'vgg_block1') + x = vgg_2block(x, 64, 128, 'vgg_block2') + x = vgg_3block(x, 128, 256, 'vgg_block3') + x = vgg_3block(x, 256, 512, 'vgg_block4') + x = vgg_3block(x, 512, 512, 'vgg_block5') + + elif num_layers == 19: + print('Building VGG-19 model...') + x = vgg_2block(x, 3, 64, 'vgg_block1') + x = vgg_2block(x, 64, 128, 'vgg_block2') + x = vgg_4block(x, 128, 256, 'vgg_block3') + x = vgg_4block(x, 256, 512, 'vgg_block4') + x = vgg_4block(x, 512, 512, 'vgg_block5') + + else: + assert False, 'VGG model should have 16 or 19 layers!' + + x = ht.array_reshape_op(x, (-1, 512)) + x = vgg_fc(x, 512, 4096, 'vgg_fc1') + x = vgg_fc(x, 4096, 4096, 'vgg_fc2') + y = vgg_fc(x, 4096, num_class, 'vgg_fc3') + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + + return loss, y + + +def vgg16(x, y_, num_class=10): + return vgg(x, y_, 16, num_class) + + +def vgg19(x, y_, num_class=10): + return vgg(x, y_, 19, num_class) diff --git a/examples/cnn/models/__init__.py b/examples/cnn/models/__init__.py new file mode 100644 index 0000000..f6306d2 --- /dev/null +++ b/examples/cnn/models/__init__.py @@ -0,0 +1,9 @@ +from .VGG import vgg, vgg16, vgg19 +from .LogReg import logreg +from .CNN import cnn_3_layers +from .AlexNet import alexnet +from .LeNet import lenet +from .MLP import mlp +from .RNN import rnn +from .LSTM import lstm +from .ResNet import resnet, resnet18, resnet34 diff --git a/examples/cnn/pytorch_models/__init__.py b/examples/cnn/pytorch_models/__init__.py new file mode 100644 index 0000000..022e62e --- /dev/null +++ b/examples/cnn/pytorch_models/__init__.py @@ -0,0 +1,4 @@ +from .mlp import mlp +from .resnet import resnet18, resnet34, resnet50 +from .vgg import vgg16, vgg19 +from .rnn import rnn diff --git a/examples/cnn/pytorch_models/mlp.py b/examples/cnn/pytorch_models/mlp.py new file mode 100644 index 0000000..398799f --- /dev/null +++ b/examples/cnn/pytorch_models/mlp.py @@ -0,0 +1,20 @@ +import torch.nn.functional as F +import torch.nn as nn + + +class MLP(nn.Module): + def __init__(self): + super(MLP, self).__init__() + self.fc1 = nn.Linear(3072, 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 10) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + out = self.fc3(x) + return out + + +def mlp(): + return MLP() diff --git a/examples/cnn/pytorch_models/resnet.py b/examples/cnn/pytorch_models/resnet.py new file mode 100644 index 0000000..927adec --- /dev/null +++ b/examples/cnn/pytorch_models/resnet.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + 
kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion * + planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def resnet18(num_classes=10): + return ResNet(BasicBlock, [2, 2, 2, 2], num_classes) + + +def resnet34(num_classes=10): + return ResNet(BasicBlock, [3, 4, 6, 3], num_classes) + + +def resnet50(num_classes=10): + return ResNet(Bottleneck, [3, 4, 6, 3], num_classes) + + +def resnet101(num_classes=10): + return ResNet(Bottleneck, [3, 4, 23, 3], num_classes) + + +def resnet152(num_classes=10): + return ResNet(Bottleneck, [3, 8, 36, 3], num_classes) diff --git a/examples/cnn/pytorch_models/rnn.py b/examples/cnn/pytorch_models/rnn.py new file mode 100644 index 0000000..0a298d7 --- /dev/null +++ b/examples/cnn/pytorch_models/rnn.py @@ -0,0 +1,36 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class RNN(nn.Module): + def __init__(self, diminput, dimoutput, dimhidden, nsteps): + super(RNN, self).__init__() + self.diminput = diminput + self.dimoutput = dimoutput + self.dimhidden = dimhidden + self.nsteps = nsteps + self.fc1 = nn.Linear(diminput, dimhidden) + self.fc2 = nn.Linear(dimhidden*2, dimhidden) + self.fc3 = nn.Linear(dimhidden, dimoutput) + + def forward(self, x): + last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device) + for i 
in range(self.nsteps): + t = i % self.nsteps + index = torch.Tensor([idx for idx in range( + t*self.diminput, (t+1)*self.diminput)]).long().to(x.device) + cur_x = torch.index_select(x, 1, index) + h = self.fc1(cur_x) + s = torch.cat([h, last_state], axis=1) + s = self.fc2(s) + last_state = F.relu(s) + + final_state = last_state + y = self.fc3(final_state) + return y + + +def rnn(diminput, dimoutput, dimhidden, nsteps): + + return RNN(diminput, dimoutput, dimhidden, nsteps) diff --git a/examples/cnn/pytorch_models/vgg.py b/examples/cnn/pytorch_models/vgg.py new file mode 100644 index 0000000..0fa2e88 --- /dev/null +++ b/examples/cnn/pytorch_models/vgg.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +cfg = { + 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], + 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], +} + + +class VGG(nn.Module): + def __init__(self, vgg_name, num_class=10): + super(VGG, self).__init__() + self.features = self._make_layers(cfg[vgg_name]) + self.fc1 = nn.Linear(512, 4096) + self.fc2 = nn.Linear(4096, 4096) + self.classifier = nn.Linear(4096, num_class) + + def forward(self, x): + out = self.features(x) + out = out.view(out.size(0), -1) + out = self.fc2(self.fc1(out)) + out = self.classifier(out) + return out + + def _make_layers(self, cfg): + layers = [] + in_channels = 3 + for x in cfg: + if x == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), + nn.BatchNorm2d(x), + nn.ReLU(inplace=True)] + in_channels = x + layers += [nn.AvgPool2d(kernel_size=1, stride=1)] + return nn.Sequential(*layers) + + +def vgg16(num_class=10): + return VGG('VGG16', num_class) + + +def vgg19(num_class=10): + return VGG('VGG19', num_class) diff --git a/examples/cnn/run_tf_horovod.py b/examples/cnn/run_tf_horovod.py new file mode 100644 index 0000000..fa2b96f --- /dev/null +++ b/examples/cnn/run_tf_horovod.py @@ -0,0 +1,309 @@ +import os +import numpy as np +import tensorflow as tf +import tf_models +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +import horovod.tensorflow as hvd +import hetu as ht +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if rank % 8 == 0: + logger.info(msg) + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model +# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model +# horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8 +# python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn + + +# if using multi nodes setting in conda, need to modify /etc/bash.bashrc +# we can also use mpirun (default gloo): +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model +''' +def train(model, args): + hvd.init() + + def get_current_shard(data): + part_size = data.shape[0] // hvd.size() 
+ start = part_size * hvd.rank() + end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0] + return data[start:end] + + batch_size = 128 + if args.model == 'tf_resnet34': + train_images, train_labels, test_images,\ + test_labels = ht.data.tf_normalize_cifar10() + x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) + else: + datasets = ht.data.mnist() + train_images, train_labels = datasets[0] + test_images, test_labels = datasets[2] + x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) + + + n_train_batches = train_images.shape[0] // batch_size + + loss, y = model(x, y_) + opt = tf.train.GradientDescentOptimizer(learning_rate=0.01) + + global_step = tf.train.get_or_create_global_step() + # here in DistributedOptimizer by default all tensor are reduced on GPU + # can use device_sparse=xxx, device_dense=xxx to modify + # if using device_sparse='/cpu:0', the performance degrades + train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank())) + # here horovod default use gpu to initialize, which will cause OOM + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + iterations = train_images.shape[0] // batch_size + total_epoch = 10 + start_index = 0 + total_time = 0 + for ep in range(total_epoch + 1): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc = [], [] + for it in range(n_train_batches): + x_val = train_images[start_index: start_index + batch_size] + y_val = train_labels[start_index : start_index+batch_size] + start_index += batch_size + if start_index + batch_size > train_images.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val}) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + en_time = time.time() + train_time = en_time - st_time + if ep != 0: + total_time += train_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, train_time) + + print("training time:", total_time) + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, help="model to be tested") + parser.add_argument("--all", action="store_true", help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + print('Model:', raw_model) + train(model, args) + +if __name__ == '__main__': + main() +''' + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + hvd.init() + global rank + rank = hvd.rank() + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + if dataset == 'MNIST': + x = 
tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 32, 32, 3), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 32, 32, 3), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + + global_step = tf.train.get_or_create_global_step() + # In DistributedOptimizer, all tensors are reduced on GPU by default; + # pass device_sparse=... / device_dense=... to change the placement + # (using device_sparse='/cpu:0' degrades performance) + train_op = hvd.DistributedOptimizer( + opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions( + allow_growth=True, visible_device_list=str(hvd.local_rank())) + # Horovod broadcasts the initial variables on GPU by default, which can cause OOM, so broadcast on CPU instead + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession( + hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + # sess.run(tf.compat.v1.global_variables_initializer()) + + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time.time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time.time() + print_rank0("Running time of current epoch = %fs" % (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_val + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + 
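# Note: the epoch loop above runs num_epochs + 1 times and epoch 0 is excluded from running_time, so the total reported below covers num_epochs timed epochs. +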
print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/scripts/hetu_16gpu.sh b/examples/cnn/scripts/hetu_16gpu.sh new file mode 100644 index 0000000..4b4c130 --- /dev/null +++ b/examples/cnn/scripts/hetu_16gpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py +depsdir=${workdir}/../../.. +echo $depsdir +### validate and timing +$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce + diff --git a/examples/cnn/scripts/hetu_1gpu.sh b/examples/cnn/scripts/hetu_1gpu.sh new file mode 100644 index 0000000..e63b3eb --- /dev/null +++ b/examples/cnn/scripts/hetu_1gpu.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + + +# model: +# e.g. bash hetu_1gpu.sh mlp CIFAR10 + +### validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing diff --git a/examples/cnn/scripts/hetu_2gpu_ps.sh b/examples/cnn/scripts/hetu_2gpu_ps.sh new file mode 100644 index 0000000..92f5eae --- /dev/null +++ b/examples/cnn/scripts/hetu_2gpu_ps.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + +### validate and timing +python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched & +python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 & +python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 & +wait \ No newline at end of file diff --git a/examples/cnn/scripts/hetu_8gpu.sh b/examples/cnn/scripts/hetu_8gpu.sh new file mode 100644 index 0000000..f2a99e3 --- /dev/null +++ b/examples/cnn/scripts/hetu_8gpu.sh @@ -0,0 +1,8 @@ +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py +depsdir=${workdir}/../../.. 
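+# NOTE: the mpirun command below hard-codes a site-specific PYTHONPATH and Python interpreter path; adjust them (and the host/interface names in the other multi-node scripts) to match your environment.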
+ +### validate and timing +# +NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce diff --git a/examples/cnn/scripts/horovod_16gpu.sh b/examples/cnn/scripts/horovod_16gpu.sh new file mode 100644 index 0000000..7db8800 --- /dev/null +++ b/examples/cnn/scripts/horovod_16gpu.sh @@ -0,0 +1,11 @@ + +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_tf_horovod.py + +# horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing + +horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing + +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model diff --git a/examples/cnn/scripts/horovod_8gpu.sh b/examples/cnn/scripts/horovod_8gpu.sh new file mode 100644 index 0000000..10e3cc7 --- /dev/null +++ b/examples/cnn/scripts/horovod_8gpu.sh @@ -0,0 +1,6 @@ + +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_tf_horovod.py + +horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing diff --git a/examples/cnn/scripts/pytorch_16gpu_0.sh b/examples/cnn/scripts/pytorch_16gpu_0.sh new file mode 100644 index 0000000..2a847f9 --- /dev/null +++ b/examples/cnn/scripts/pytorch_16gpu_0.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=162.105.146.117 +MASTER_PORT=6000 +NNODES=2 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/pytorch_16gpu_1.sh b/examples/cnn/scripts/pytorch_16gpu_1.sh new file mode 100644 index 0000000..dba0ecf --- /dev/null +++ b/examples/cnn/scripts/pytorch_16gpu_1.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=162.105.146.117 +MASTER_PORT=39575 +NNODES=2 +NODE_RANK=1 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/pytorch_1gpu.sh b/examples/cnn/scripts/pytorch_1gpu.sh new file mode 100644 index 0000000..cc40166 --- /dev/null +++ b/examples/cnn/scripts/pytorch_1gpu.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +## validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing diff --git a/examples/cnn/scripts/pytorch_8gpu.sh 
b/examples/cnn/scripts/pytorch_8gpu.sh new file mode 100644 index 0000000..ebbb049 --- /dev/null +++ b/examples/cnn/scripts/pytorch_8gpu.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/tf_16gpu_worker0.sh b/examples/cnn/scripts/tf_16gpu_worker0.sh new file mode 100644 index 0000000..0f11c59 --- /dev/null +++ b/examples/cnn/scripts/tf_16gpu_worker0.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate & +wait + diff --git a/examples/cnn/scripts/tf_16gpu_worker1.sh b/examples/cnn/scripts/tf_16gpu_worker1.sh new file mode 100644 index 0000000..dcdde09 --- /dev/null +++ b/examples/cnn/scripts/tf_16gpu_worker1.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate & +python ${mainpy} --model $1 
--dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate & +wait \ No newline at end of file diff --git a/examples/cnn/scripts/tf_1gpu.sh b/examples/cnn/scripts/tf_1gpu.sh new file mode 100644 index 0000000..f58f323 --- /dev/null +++ b/examples/cnn/scripts/tf_1gpu.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_main.py + +### validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing + +### run in cpu +# python ${mainpy} --model tf_mlp --gpu -1 --validate --timing diff --git a/examples/cnn/scripts/tf_8gpu.sh b/examples/cnn/scripts/tf_8gpu.sh new file mode 100644 index 0000000..70ecb79 --- /dev/null +++ b/examples/cnn/scripts/tf_8gpu.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate & +wait + diff --git a/examples/cnn/settings/tf_dist_s1_w16.json b/examples/cnn/settings/tf_dist_s1_w16.json new file mode 100644 index 0000000..2b3c1d0 --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w16.json @@ -0,0 +1,23 @@ +{ + "worker": [ + "162.105.146.117:34569", + "162.105.146.117:34568", + "162.105.146.117:34567", + "162.105.146.117:34566", + "162.105.146.117:34565", + "162.105.146.117:34564", + "162.105.146.117:34563", + "162.105.146.117:34562", + "162.105.146.118:34779", + "162.105.146.118:34778", + "162.105.146.118:34777", + "162.105.146.118:34776", + "162.105.146.118:34775", + "162.105.146.118:34774", + "162.105.146.118:34773", + "162.105.146.118:34772" + ], + "ps": [ + "162.105.146.117:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/settings/tf_dist_s1_w4.json b/examples/cnn/settings/tf_dist_s1_w4.json new file mode 100644 index 0000000..c9f08f6 --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w4.json @@ -0,0 +1,11 @@ +{ + "worker": [ + "162.105.146.119:34569", + "162.105.146.119:34568", + "162.105.146.119:34567", + "162.105.146.119:34566" + ], + "ps": [ + "162.105.146.119:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/settings/tf_dist_s1_w8.json 
b/examples/cnn/settings/tf_dist_s1_w8.json new file mode 100644 index 0000000..92ce51c --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w8.json @@ -0,0 +1,15 @@ +{ + "worker": [ + "162.105.146.119:34569", + "162.105.146.119:34568", + "162.105.146.119:34567", + "162.105.146.119:34566", + "162.105.146.119:34565", + "162.105.146.119:34564", + "162.105.146.119:34563", + "162.105.146.119:34562" + ], + "ps": [ + "162.105.146.119:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/tf_launch_server.py b/examples/cnn/tf_launch_server.py new file mode 100644 index 0000000..67ca216 --- /dev/null +++ b/examples/cnn/tf_launch_server.py @@ -0,0 +1,49 @@ +import os +import tensorflow as tf +import multiprocessing +import signal +import json +import argparse + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + os.environ['CUDA_VISIBLE_DEVICES'] = '' + + +pop_env() + + +def start_server(cluster, task_id): + server = tf.train.Server(cluster, job_name='ps', task_index=task_id) + server.join() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path") + parser.add_argument("--id", type=int, required=True) + args = parser.parse_args() + raw_config = args.config + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + global proc + proc = multiprocessing.Process( + target=start_server, args=[cluster, args.id, ]) + proc.start() + signal.signal(signal.SIGINT, signal_handler) + proc.join() + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + global proc + proc.kill() + exit(0) + + +if __name__ == '__main__': + main() diff --git a/examples/cnn/tf_launch_worker.py b/examples/cnn/tf_launch_worker.py new file mode 100644 index 0000000..925bc83 --- /dev/null +++ b/examples/cnn/tf_launch_worker.py @@ -0,0 +1,234 @@ +import tensorflow as tf +import tf_models +import hetu as ht + +import numpy as np +import argparse +import json +from time import time +import os +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if task_id % 8 == 0: + logger.info(msg) + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, 
default='./settings/tf_dist_s1_w2.json', help="config file path") + + args = parser.parse_args() + global task_id + + task_id = int(args.rank) + print_rank0("task id %d" % (task_id)) + raw_config = args.config + + if args.gpu == -1: + device = '/job:worker/task:%d/cpu:0' % (task_id) + print_rank0('Use CPU.') + else: + device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + with tf.device( + tf.compat.v1.train.replica_device_setter( + worker_device=device, + cluster=cluster)): + # data loading + print_rank0('Loading %s data...' 
% dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + if dataset == 'MNIST': + x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + 
print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % + (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_all + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/tf_main.py b/examples/cnn/tf_main.py new file mode 100644 index 0000000..feb1400 --- /dev/null +++ b/examples/cnn/tf_main.py @@ -0,0 +1,194 @@ +import tensorflow as tf +import tf_models +import hetu as ht +import numpy as np +import argparse +from time import time +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + logger.info(msg) + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + if args.gpu == -1: + device = '/cpu:0' + print_rank0('Use CPU.') + else: + device = '/gpu:%d' % args.gpu + print_rank0('Use GPU %d.' % args.gpu) + + print_rank0("Training {} on TensorFlow".format(args.model)) + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
+ if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + # model definition + print_rank0('Building model...') + with tf.device(device): + if dataset == 'MNIST': + x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + + train_op = opt.minimize(loss) + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + # training + print_rank0("Start training loop...") + running_time = 0 + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = 
train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % + (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_all + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/tf_models/__init__.py b/examples/cnn/tf_models/__init__.py new file mode 100644 index 0000000..6d713ff --- /dev/null +++ b/examples/cnn/tf_models/__init__.py @@ -0,0 +1,8 @@ +from .tf_LogReg import tf_logreg +from .tf_CNN import tf_cnn_3_layers +from .tf_LeNet import tf_lenet +from .tf_MLP import tf_mlp +from .tf_RNN import tf_rnn +from .tf_LSTM import tf_lstm +from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34 +from .tf_VGG import tf_vgg16, tf_vgg19 diff --git a/examples/cnn/tf_models/tf_CNN.py b/examples/cnn/tf_models/tf_CNN.py new file mode 100644 index 0000000..d8036a7 --- /dev/null +++ b/examples/cnn/tf_models/tf_CNN.py @@ -0,0 +1,45 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv_relu_avg(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) + x = tf.nn.relu(x) + x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1], + padding='VALID', strides=[1, 2, 2, 1]) + return x + + +def tf_fc(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.reshape(x, (-1, shape[0])) + y = tf.matmul(x, weight) + bias + return y + + +def tf_cnn_3_layers(x, y_): + ''' + 3-layer-CNN model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print('Building 3-layer-CNN model in tensorflow...') + x = tf.reshape(x, [-1, 28, 28, 1]) + x = tf_conv_relu_avg(x, (32, 1, 5, 5)) + x = tf_conv_relu_avg(x, (64, 32, 5, 5)) + x = tf.transpose(x, [0, 3, 1, 2]) + y = tf_fc(x, (7 * 7 * 64, 10)) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LSTM.py b/examples/cnn/tf_models/tf_LSTM.py new file mode 100644 index 0000000..32ebe23 --- /dev/null +++ b/examples/cnn/tf_models/tf_LSTM.py @@ -0,0 +1,81 @@ +import numpy as np +import tensorflow as tf + + +def tf_lstm(x, y_): + ''' + LSTM model in TensorFlow, for MNIST dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building LSTM model in tensorflow...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + forget_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + forget_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + forget_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + input_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + input_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + input_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + output_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + output_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + output_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + tanh_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + tanh_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + tanh_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + out_weights = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) + out_bias = tf.Variable(np.random.normal( + scale=0.1, size=(dimoutput,)).astype(np.float32)) + initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32) + + last_c_state = initial_state + last_h_state = initial_state + + for i in range(nsteps): + cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) + # forget gate + cur_forget = tf.matmul(last_h_state, forget_gate_u) + \ + tf.matmul(cur_x, forget_gate_w) + forget_gate_b + cur_forget = tf.sigmoid(cur_forget) + # input gate + cur_input = tf.matmul(last_h_state, input_gate_u) + \ + tf.matmul(cur_x, input_gate_w) + input_gate_b + cur_input = tf.sigmoid(cur_input) + # output gate + cur_output = tf.matmul(last_h_state, output_gate_u) + \ + tf.matmul(cur_x, 
output_gate_w) + output_gate_b + cur_output = tf.sigmoid(cur_output) + # tanh + cur_tanh = tf.matmul(last_h_state, tanh_u) + \ + tf.matmul(cur_x, tanh_w) + tanh_b + cur_tanh = tf.tanh(cur_tanh) + + last_c_state = last_c_state * cur_forget + cur_input * cur_tanh + last_h_state = tf.tanh(last_c_state) * cur_output + + y = tf.matmul(last_h_state, out_weights) + out_bias + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LeNet.py b/examples/cnn/tf_models/tf_LeNet.py new file mode 100644 index 0000000..03905f5 --- /dev/null +++ b/examples/cnn/tf_models/tf_LeNet.py @@ -0,0 +1,49 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv_pool(x, in_channel, out_channel): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) + x = tf.nn.relu(x) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], + padding='VALID', strides=[1, 2, 2, 1]) + return x + + +def tf_fc(x, shape, with_relu=True): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + if with_relu: + x = tf.nn.relu(x) + return x + + +def tf_lenet(x, y_): + ''' + LeNet model in TensorFlow, for MNIST dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print('Building LeNet model in tensorflow...') + x = tf.reshape(x, [-1, 28, 28, 1]) + x = tf_conv_pool(x, 1, 6) + x = tf_conv_pool(x, 6, 16) + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, (-1, 7*7*16)) + x = tf_fc(x, (7*7*16, 120), with_relu=True) + x = tf_fc(x, (120, 84), with_relu=True) + y = tf_fc(x, (84, 10), with_relu=False) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LogReg.py b/examples/cnn/tf_models/tf_LogReg.py new file mode 100644 index 0000000..67a7577 --- /dev/null +++ b/examples/cnn/tf_models/tf_LogReg.py @@ -0,0 +1,23 @@ +import numpy as np +import tensorflow as tf + + +def tf_logreg(x, y_): + ''' + Logistic Regression model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Build logistic regression model in tensorflow...") + weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32)) + bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32)) + y = tf.matmul(x, weight) + bias + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_MLP.py b/examples/cnn/tf_models/tf_MLP.py new file mode 100644 index 0000000..8a23646 --- /dev/null +++ b/examples/cnn/tf_models/tf_MLP.py @@ -0,0 +1,34 @@ +import numpy as np +import tensorflow as tf + + +def tf_fc(x, shape, with_relu=True): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + if with_relu: + x = tf.nn.relu(x) + return x + + +def tf_mlp(x, y_, num_class=10): + ''' + MLP model in TensorFlow, for CIFAR dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building MLP model in tensorflow...") + x = tf_fc(x, (3072, 256), with_relu=True) + x = tf_fc(x, (256, 256), with_relu=True) + y = tf_fc(x, (256, num_class), with_relu=False) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_RNN.py b/examples/cnn/tf_models/tf_RNN.py new file mode 100644 index 0000000..c1d1436 --- /dev/null +++ b/examples/cnn/tf_models/tf_RNN.py @@ -0,0 +1,49 @@ +import numpy as np +import tensorflow as tf + + +def tf_rnn(x, y_): + ''' + RNN model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building RNN model in tensorflow...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + weight1 = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + bias1 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, )).astype(np.float32)) + weight2 = tf.Variable(np.random.normal(scale=0.1, size=( + dimhidden + dimhidden, dimhidden)).astype(np.float32)) + bias2 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, )).astype(np.float32)) + weight3 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) + bias3 = tf.Variable(np.random.normal( + scale=0.1, size=(dimoutput, )).astype(np.float32)) + last_state = tf.zeros((128, dimhidden), dtype=tf.float32) + + for i in range(nsteps): + cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) + h = tf.matmul(cur_x, weight1) + bias1 + + s = tf.concat([h, last_state], axis=1) + s = tf.matmul(s, weight2) + bias2 + last_state = tf.nn.relu(s) + + final_state = last_state + y = tf.matmul(final_state, weight3) + bias3 + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_ResNet.py b/examples/cnn/tf_models/tf_ResNet.py new file mode 100644 index 0000000..a0e0b2c --- /dev/null +++ b/examples/cnn/tf_models/tf_ResNet.py @@ -0,0 +1,113 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv2d(x, in_channel, out_channel, stride=1): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME') + return x + + +def tf_batch_norm_with_relu(x, hidden): + scale = tf.Variable(np.random.normal( + scale=0.1, size=(hidden,)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(hidden,)).astype(np.float32)) + axis = list(range(len(x.shape) - 1)) + a_mean, a_var = tf.nn.moments(x, axis) + x = tf.nn.batch_normalization( + x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) + x = tf.nn.relu(x) + return x + + +def tf_resnet_block(x, in_channel, num_blocks, is_first=False): + if is_first: + out_channel = in_channel + identity = x + x = tf_conv2d(x, in_channel, out_channel, stride=1) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = x + identity + else: + out_channel = 2 * in_channel + identity = x + x = tf_batch_norm_with_relu(x, in_channel) + x = tf_conv2d(x, in_channel, out_channel, stride=2) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [ + in_channel // 2, in_channel // 2]]) + x = x + identity + + for i in range(1, num_blocks): + identity = x + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = x + identity + 
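In the downsampling branch above, the stride-2 convolution doubles the channel count while the identity path is average-pooled and zero-padded along channels so the two tensors can be added. The NumPy-only sketch below (not part of tf_ResNet.py) checks those shapes, assuming NHWC layout and in_channel = 16, which matches the base_size used by tf_resnet below.

```python
# Shape check of the zero-padded shortcut (assumes NHWC and in_channel = 16,
# so the stride-2 conv path produces 2 * 16 = 32 output channels).
import numpy as np

n, h, w, c = 2, 32, 32, 16
identity = np.zeros((n, h, w, c), dtype=np.float32)

# 2x2 average pooling with stride 2 ('VALID'): spatial dims are halved.
pooled = identity.reshape(n, h // 2, 2, w // 2, 2, c).mean(axis=(2, 4))

# Zero-pad c // 2 = 8 channels on each side so the channel count matches the conv path.
padded = np.pad(pooled, [(0, 0), (0, 0), (0, 0), (c // 2, c // 2)])

print(pooled.shape, padded.shape)  # (2, 16, 16, 16) (2, 16, 16, 32)
```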
+ return x + + +def tf_fc(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + return x + + +def tf_resnet(x, y_, num_layers, num_class=10): + ''' + ResNet model in TensorFlow, for CIFAR10 dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + print("Number of Class: {}".format(num_class)) + base_size = 16 + + x = tf_conv2d(x, 3, base_size, stride=1) + x = tf_batch_norm_with_relu(x, base_size) + + if num_layers == 18: + print("Building ResNet-18 model in tensorflow...") + x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True) + x = tf_resnet_block(x, base_size, num_blocks=2) + x = tf_resnet_block(x, 2 * base_size, num_blocks=2) + x = tf_resnet_block(x, 4 * base_size, num_blocks=2) + elif num_layers == 34: + print("Building ResNet-34 model in tensorflow...") + x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True) + x = tf_resnet_block(x, base_size, num_blocks=4) + x = tf_resnet_block(x, 2 * base_size, num_blocks=6) + x = tf_resnet_block(x, 4 * base_size, num_blocks=3) + else: + assert False, "Number of layers should be 18 or 34 !" + + x = tf_batch_norm_with_relu(x, 8 * base_size) + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, [-1, 128 * base_size]) + y = tf_fc(x, (128 * base_size, num_class)) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y + + +def tf_resnet18(x, y_, num_class=10): + return tf_resnet(x, y_, 18, num_class) + + +def tf_resnet34(x, y_, num_class=10): + return tf_resnet(x, y_, 34, num_class) diff --git a/examples/cnn/tf_models/tf_VGG.py b/examples/cnn/tf_models/tf_VGG.py new file mode 100644 index 0000000..124457d --- /dev/null +++ b/examples/cnn/tf_models/tf_VGG.py @@ -0,0 +1,103 @@ +import numpy as np +import tensorflow as tf + + +def conv_bn_relu(x, in_channel, out_channel): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) + scale = tf.Variable(np.random.normal( + scale=0.1, size=(out_channel,)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(out_channel,)).astype(np.float32)) + x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME') + axis = list(range(len(x.shape) - 1)) + a_mean, a_var = tf.nn.moments(x, axis) + x = tf.nn.batch_normalization( + x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) + x = tf.nn.relu(x) + return x + + +def vgg_2block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def vgg_3block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def vgg_4block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = 
conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def tf_fc(x, in_feat, out_feat): + weight = tf.Variable(np.random.normal( + scale=0.1, size=(in_feat, out_feat)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(out_feat,)).astype(np.float32)) + x = tf.matmul(x, weight) + bias + return x + + +def tf_vgg(x, y_, num_layers, num_class=10): + ''' + ResNet model in TensorFlow, for CIFAR10 dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + if num_layers == 16: + print('Building VGG-16 model in tensorflow') + x = vgg_2block(x, 3, 64) + x = vgg_2block(x, 64, 128) + x = vgg_3block(x, 128, 256) + x = vgg_3block(x, 256, 512) + x = vgg_3block(x, 512, 512) + + elif num_layers == 19: + print('Building VGG-19 model in tensorflow') + x = vgg_2block(x, 3, 64) + x = vgg_2block(x, 64, 128) + x = vgg_4block(x, 128, 256) + x = vgg_4block(x, 256, 512) + x = vgg_4block(x, 512, 512) + else: + assert False, "Number of layers should be 18 or 34 !" + + x = tf.reshape(x, [-1, 512]) + x = tf_fc(x, 512, 4096) + x = tf_fc(x, 4096, 4096) + y = tf_fc(x, 4096, num_class) + print("Number of Class: {}".format(num_class)) + + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y + + +def tf_vgg16(x, y_, num_class=10): + return tf_vgg(x, y_, 16, num_class) + + +def tf_vgg19(x, y_, num_class=10): + return tf_vgg(x, y_, 34, num_class) diff --git a/examples/cnn/torch_main.py b/examples/cnn/torch_main.py new file mode 100644 index 0000000..f6ed928 --- /dev/null +++ b/examples/cnn/torch_main.py @@ -0,0 +1,213 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from pytorch_models import * +import hetu as ht +import numpy as np +import argparse +from time import time +import os +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if local_rank % 8 == 0: + logger.info(msg) + + +def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None): + print_rank0('Epoch: %d' % epoch) + n_train_batches = data.shape[0] // batch_size + + net.train() + + train_loss = 0 + correct = 0 + total = 0 + + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + inputs = torch.Tensor(data[minibatch_start:minibatch_end]) + targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() + + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + train_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1))) + print_rank0("Train accuracy = %f" % 
(100.*correct/total)) + + +def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None): + net.eval() + n_test_batches = data.shape[0] // batch_size + test_loss = 0 + correct = 0 + total = 0 + + with torch.no_grad(): + for minibatch_index in range(n_test_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + inputs = torch.Tensor(data[minibatch_start:minibatch_end]) + targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() + + inputs, targets = inputs.to(device), targets.to(device) + outputs = net(inputs) + loss = criterion(outputs, targets) + test_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1))) + print_rank0("Validation accuracy = %f" % (100.*correct/total)) + + +if __name__ == "__main__": + # argument parser + global local_rank + local_rank = 0 + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument('--distributed', action='store_true', + help='whether to distributed training') + parser.add_argument('--local_rank', type=int, default=-1) + args = parser.parse_args() + + if args.distributed == True: + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + print("***"*50) + print(init_method) + torch.distributed.init_process_group(backend="nccl", + world_size=world_size, + rank=rank, + init_method=init_method) + + if args.gpu == -1: + device = 'cpu' + else: + if args.distributed == True: + local_rank = rank % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + device = torch.device('cuda:%d' % local_rank) + logger.info('Use GPU %d.' % local_rank) + else: + device = torch.device('cuda:%d' % args.gpu) + torch.cuda.set_device(args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + + assert args.model in ['mlp', 'resnet18', 'resnet34', + 'vgg16', 'vgg19', 'rnn'], 'Model not supported now.' 
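The --distributed branch above reads its rendezvous settings from environment variables rather than from a config file. The values in the sketch below are illustrative only; in a real run each process needs its own RANK, usually exported by the launcher.

```python
# Illustrative environment for one worker of a 2-process run of torch_main.py --distributed.
# RANK must differ per process; MASTER_ADDR/MASTER_PORT must point at the rank-0 host.
import os

os.environ.update({
    "MASTER_ADDR": "127.0.0.1",  # read via os.getenv('MASTER_ADDR', 'localhost') above
    "MASTER_PORT": "6000",       # read via os.getenv('MASTER_PORT', '6000') above
    "RANK": "0",                 # this process's global rank
    "WORLD_SIZE": "2",           # total number of processes
})
print("init method:", "tcp://%s:%s" % (os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]))
```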
+ + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': + net = eval(args.model)(100) + elif args.model == 'rnn': + net = eval(args.model)(28, 10, 128, 28) + else: + net = eval(args.model)() + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + net.to(device) + if args.distributed: + net = torch.nn.parallel.DistributedDataParallel( + net, device_ids=[local_rank]) + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate, + momentum=0.9, nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = optim.Adagrad(net.parameters(), lr=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = optim.Adam(lr=args.learning_rate) + + criterion = nn.CrossEntropyLoss() + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist(onehot=False) + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=10, onehot=False) + if args.model == "mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=100, onehot=False) + + running_time = 0 + # training + print_rank0("Start training loop...") + for i in range(args.num_epochs + 1): + if args.timing: + start = time() + train(epoch=i, net=net, data=train_set_x, label=train_set_y, + batch_size=args.batch_size, criterion=criterion, optimizer=opt) + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % (end - start)) + if i != 0: + running_time += (end - start) + test(epoch=i, net=net, data=valid_set_x, label=valid_set_y, + batch_size=args.batch_size, criterion=criterion) + + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/worker_conf0.json b/examples/cnn/worker_conf0.json new file mode 100644 index 0000000..b705130 --- /dev/null +++ b/examples/cnn/worker_conf0.json @@ -0,0 +1,9 @@ +{ +"DMLC_ROLE":"worker", +"WORKER_ID":"0", +"DMLC_PS_ROOT_URI":"127.0.0.1", +"DMLC_PS_ROOT_PORT":"13030", +"DMLC_NUM_WORKER":"2", +"DMLC_NUM_SERVER":"1", +"DMLC_PS_VAN_TYPE":"p3" +} diff --git a/examples/cnn/worker_conf1.json b/examples/cnn/worker_conf1.json new file mode 100644 index 0000000..831826d --- /dev/null +++ b/examples/cnn/worker_conf1.json @@ -0,0 +1,9 @@ +{ +"DMLC_ROLE":"worker", +"WORKER_ID":"1", +"DMLC_PS_ROOT_URI":"127.0.0.1", +"DMLC_PS_ROOT_PORT":"13030", +"DMLC_NUM_WORKER":"2", +"DMLC_NUM_SERVER":"1", +"DMLC_PS_VAN_TYPE":"p3" +} diff --git a/examples/ctr/.gitignore b/examples/ctr/.gitignore new file mode 100644 index 0000000..ccb0cb2 --- /dev/null +++ 
b/examples/ctr/.gitignore @@ -0,0 +1,2 @@ +datasets/ +logs/ diff --git a/examples/ctr/README.md b/examples/ctr/README.md new file mode 100644 index 0000000..c99c5db --- /dev/null +++ b/examples/ctr/README.md @@ -0,0 +1,109 @@ +# CTR Examples (with Distributed Settings) +In this directory we provide several models for CTR tasks. We use Wide & Deep model to train on Adult and Criteo dataset, and DeepFM, DCN, DC models on Criteo dataset. + +## Structure +``` +- ctr + - datasets/ contains sampled criteo data + - models/ ctr models in hetu + - tf_models/ ctr models in tensorflow + - settings/ configurations for distributed training + - tests/ test scripts + - kill.sh script to kill all python processes + - run_hetu.py basic trainer for hetu + - run_tf_local.py local trainer for tensorflow + - run_tf_horovod.py trainer for tensorflow in horovod setting + - run_tf_parallax.py trainer for tensorflow in parallax setting + - tf_launch_server.py launcher for server in tensorflow + - tf_launch_worker.py launcher for worker in tensorflow +``` + +## Prepare criteo data +* We have provided a sampled version of kaggle-criteo dataset, which locates in ./datasets/criteo/ . To use the given data, please do not specify the 'all' flag and 'val' flag when running test files. +* To download the original kaggle-criteo dataset, please specify a source in models/load_data.py and use ```python models/load_data.py``` to download the whole kaggle-criteo dataset. + + +## Flags for test files +Here we explain some of the flags you may use in test files: +* model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult') +* config: to specify the configuration file in settings. +* val: whether using validation. +* cache: whether using cache in PS/Hybrid mode. +* bsp: whether using bsp (default asp) in PS/Hybrid mode. (In Hybrid, AllReduce can enforce dense parameters to use bsp, so there will be no stragglers.) +* all: whether to use all criteo data. +* bound: per embedding entry staleness in cache setting, default to be 100. + + +## Usage +If memory available, you can try to run the model locally, by running +```bash +# run locally +bash tests/local_{model}_{dataset}.sh +# run in ps setting (locally) +bash tests/ps_{model}_{dataset}.sh +# run in hybrid setting (locally) +bash tests/hybrid_{model}_{dataset}.sh + +# run tensorflow locally +python run_tf_local.py --model {model}_{dataset} +# run tensorflow in horovod +horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset} +# run tensorflow in parallax +python {absolute_path_to}/run_tf_parallax.py +# run tensorflow in ps setting +python tf_launch_server.py --config {config} --id {rank} +python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config} +``` + + +## Configuration +We use a simple yaml file to specify the run configuration. + +```yaml +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 +launch : + worker : 4 + server : 1 + scheduler : true +``` + +The 4 k-v pair in "shared" are used for PS-lite parameter server and will be added into environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" into an available IP address in the cluster. + +The following "launch" is only used in PS-mode (ommitted in hybrid mode). This means that the number of worker, server and scheduler launched locally on this machine. In hybrid mode, workers are launched by mpirun. 
Servers and schedulers will be launched by + + +## Examples +### Local execution +Run wdl with criteo locally(if the whole dataset is downloaded, you can use all data or use validate data): +```bash +python run_hetu.py --model wdl_criteo (--all) (--val) +``` + +### PS mode execution +Run ps locally, here we can also run on multiple nodes. +```bash +# launch scheduler and server, -n means number of servers, --sched means using scheduler +python -m hetu.launcher {config} -n 1 --sched +# launch workers (or run scheduler and server together if configured in config file) +python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10) +``` +You can also specify the cache to be used and also the cache bound. + + +### Hybrid mode execution +You must launch a scheduler and server in one terminal: +```bash +python -m hetu.launcher {config} -n 1 --sched +``` +And then launch the workers simultaneously using mpirun command: +```bash +mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ... +``` +Or if in distributed nodes setting: +``` +mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ... +``` diff --git a/examples/ctr/kill.sh b/examples/ctr/kill.sh new file mode 100755 index 0000000..4d99771 --- /dev/null +++ b/examples/ctr/kill.sh @@ -0,0 +1,3 @@ +#/bin/bash +#pkill -f mnist_mlp_ps.py +kill -9 $(pidof python) diff --git a/examples/ctr/models/__init__.py b/examples/ctr/models/__init__.py new file mode 100644 index 0000000..8c17586 --- /dev/null +++ b/examples/ctr/models/__init__.py @@ -0,0 +1,5 @@ +from .wdl_adult import wdl_adult +from .dcn_criteo import dcn_criteo +from .dc_criteo import dc_criteo +from .wdl_criteo import wdl_criteo +from .deepfm_criteo import dfm_criteo diff --git a/examples/ctr/models/dc_criteo.py b/examples/ctr/models/dc_criteo.py new file mode 100644 index 0000000..30b3522 --- /dev/null +++ b/examples/ctr/models/dc_criteo.py @@ -0,0 +1,63 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def residual_layer(x0, input_dim, hidden_dim): + + embedding_len = input_dim + weight_1 = init.random_normal( + shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1') + bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1') + weight_2 = init.random_normal( + shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2') + bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2') + + x0w = ht.matmul_op(x0, weight_1) # (batch, hidden_dim) + x0w_b = x0w + ht.broadcastto_op(bias_1, x0w) + + relu1 = ht.relu_op(x0w_b) + x1w = ht.matmul_op(relu1, weight_2) # (batch, input_dim) + x1w_b = x1w + ht.broadcastto_op(bias_2, x1w) + residual = x1w_b + x0 + y = ht.relu_op(residual) + return y + + +def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3): + for i in range(num_layers): + x0 = residual_layer(x0, input_dim, hidden_dim) + return x0 + + +def dc_criteo(dense_input, sparse_input, y_): + + feature_dimension = 33762577 + embedding_size = 8 + learning_rate = 0.001 + + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding") + sparse_input = ht.embedding_lookup_op(Embedding, sparse_input) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + + # dc_model + x = ht.concat_op(sparse_input, dense_input, axis=1) + + input_dim = 26 * 8 + 13 + hidden_dim = 
input_dim + residual_out = build_residual_layers( + x, input_dim, hidden_dim, num_layers=5) + + W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4") + y = ht.matmul_op(residual_out, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/dcn_criteo.py b/examples/ctr/models/dcn_criteo.py new file mode 100644 index 0000000..2222d22 --- /dev/null +++ b/examples/ctr/models/dcn_criteo.py @@ -0,0 +1,68 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def cross_layer(x0, x1): + # x0: input embedding feature (batch_size, 26 * embedding_size + 13) + # x1: the output of last layer (batch_size, 26 * embedding_size + 13) + + embedding_len = 26 * 128 + 13 + weight = init.random_normal( + shape=(embedding_len, 1), stddev=0.01, name='weight') + bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias') + x1w = ht.matmul_op(x1, weight) # (batch_size, 1) + y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0)) + y = y + x1 + ht.broadcastto_op(bias, y) + return y + + +def build_cross_layer(x0, num_layers=3): + x1 = x0 + for i in range(num_layers): + x1 = cross_layer(x0, x1) + return x1 + + +def dcn_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.003 + + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_input = ht.embedding_lookup_op( + Embedding, sparse_input, ctx=ht.cpu(0)) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + x = ht.concat_op(sparse_input, dense_input, axis=1) + # Cross Network + cross_output = build_cross_layer(x, num_layers=3) + + # DNN + flatten = x + W1 = init.random_normal( + [26*embedding_size + 13, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 256], stddev=0.01, name="W3") + + W4 = init.random_normal( + [256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = ht.concat_op(cross_output, y3, axis=1) + y = ht.matmul_op(y4, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/deepfm_criteo.py b/examples/ctr/models/deepfm_criteo.py new file mode 100644 index 0000000..d84f299 --- /dev/null +++ b/examples/ctr/models/deepfm_criteo.py @@ -0,0 +1,59 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def dfm_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 + + # FM + Embedding1 = init.random_normal( + [feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0)) + FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter") + sparse_1dim_input = ht.embedding_lookup_op( + Embedding1, sparse_input, ctx=ht.cpu(0)) + fm_dense_part = ht.matmul_op(dense_input, FM_W) + fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1) + # fst order output + y1 = fm_dense_part + fm_sparse_part 
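The second-order part of the model, built next, uses the standard FM rewrite 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), summed over the embedding dimension, instead of enumerating all field pairs. The NumPy snippet below (synthetic data, not part of the model code) verifies that identity.

```python
# NumPy check that sum_{i<j} <v_i, v_j> equals
# 0.5 * ( (sum_i v_i)^2 - sum_i v_i^2 ) summed over the embedding dimension,
# which is the trick used by the sparse_2dim_* computation below.
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(26, 8))  # 26 field embeddings, embedding_size 8

pairwise = sum(v[i] @ v[j] for i in range(26) for j in range(i + 1, 26))
trick = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0)).sum()
print(np.allclose(pairwise, trick))  # True
```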
+ + Embedding2 = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_2dim_input = ht.embedding_lookup_op( + Embedding2, sparse_input, ctx=ht.cpu(0)) + sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1) + sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum) + + sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input) + sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1) + sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum + sparse_2dim_half = sparse_2dim * 0.5 + # snd order output + y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True) + + # DNN + flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size)) + W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 1], stddev=0.01, name="W3") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = y1 + y2 + y = y4 + y3 + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/load_data.py b/examples/ctr/models/load_data.py new file mode 100644 index 0000000..6122342 --- /dev/null +++ b/examples/ctr/models/load_data.py @@ -0,0 +1,320 @@ +import os +import numpy as np + + +########################################################################### +# criteo +########################################################################### + +def download_criteo(path): + import tarfile + import pandas as pd + from six.moves import urllib + if not os.path.exists(path): + os.makedirs(path) + assert os.path.isdir(path), 'Please provide a directory path.' + # this source may be invalid, please use other valid sources. 
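+    # Note: this function downloads criteo.tar.gz into `path`, extracts it, converts
+    # train.txt to train.csv, and then writes six .npy arrays (train/test dense features,
+    # sparse features and labels, split 90%/10% by a random permutation) into `path`.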
+ origin = ( + 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz' + ) + print('Downloading data from %s' % origin) + dataset = os.path.join(path, 'criteo.tar.gz') + urllib.request.urlretrieve(origin, dataset) + print("Extracting criteo zip...") + with tarfile.open(dataset) as f: + f.extractall(path=path) + print("Create local files...") + + # save csv filed + df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None) + df.columns = ['label'] + ["I" + + str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)] + df.to_csv(os.path.join(path, "train.csv"), index=0) + print('Csv file saved.') + + # save numpy arrays + target_path = [os.path.join(path, filename) for filename in [ + 'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy', + 'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']] + dense_feats = [col for col in df.columns if col.startswith('I')] + sparse_feats = [col for col in df.columns if col.startswith('C')] + labels = df['label'] + dense_feats = process_dense_feats(df, dense_feats) + sparse_feats = process_sparse_feats(df, sparse_feats) + num_data = dense_feats.shape[0] + perm = np.random.permutation(num_data) + # split data in 2 parts + test_num = num_data // 10 + processed_data = [ + dense_feats[perm[:-test_num]], # train dense + sparse_feats[perm[:-test_num]], # train sparse + labels[perm[:-test_num]], # train labels + dense_feats[perm[-test_num:]], # validate dense + sparse_feats[perm[-test_num:]], # validate sparse + labels[perm[-test_num:]], # validate labels + ] + print('Array shapes:') + for i in range(len(processed_data)): + print(os.path.split(target_path[i]) + [-1].split('.')[0], processed_data[i].shape) + np.save(target_path[i], processed_data[i]) + print('Numpy arrays saved.') + + +def process_dense_feats(data, feats): + d = data.copy() + d = d[feats].fillna(0.0) + for f in feats: + d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1) + return d + + +def process_sparse_feats(data, feats): + from sklearn.preprocessing import LabelEncoder + # process to embeddings. 
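+    # For example, if C14 takes the values {a, b} they become ids {0, 1}; C15's ids then
+    # start at 2, so values {x, y, z} become {2, 3, 4}. Every categorical column therefore
+    # indexes one shared id space, which is why the CTR models above use a single embedding
+    # table of size feature_dimension = 33762577 covering all 26 Criteo fields.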
+ d = data.copy() + d = d[feats].fillna("-1") + for f in feats: + label_encoder = LabelEncoder() + d[f] = label_encoder.fit_transform(d[f]) + feature_cnt = 0 + for f in feats: + d[f] += feature_cnt + feature_cnt += d[f].nunique() + return d + + +def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True): + import pandas as pd + csv_path = os.path.join(path, "train.csv") + if not os.path.exists(csv_path): + download_criteo(path) + df = pd.read_csv(csv_path, nrows=nrows, header=0) + dense_feats = [col for col in df.columns if col.startswith('I')] + sparse_feats = [col for col in df.columns if col.startswith('C')] + labels = np.array(df['label']).reshape(-1, 1) + dense_feats = np.array(process_dense_feats(df, dense_feats)) + sparse_feats = np.array(process_sparse_feats( + df, sparse_feats)).astype(np.int32) + if return_val: + test_num = nrows // 10 + train_dense = dense_feats[:-test_num] + train_sparse = sparse_feats[:-test_num] + train_label = labels[:-test_num] + validate_dense = dense_feats[-test_num:] + validate_sparse = sparse_feats[-test_num:] + validate_label = labels[-test_num:] + return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label) + else: + return dense_feats, sparse_feats, labels + + +def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')): + # all data should be available! no checking. + processed_data = [np.load(os.path.join(path, filename)) + for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']] + return tuple(processed_data) + + +def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True): + file_paths = [os.path.join(path, filename) for filename in [ + 'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy', + 'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']] + if not all([os.path.exists(p) for p in file_paths]): + download_criteo(path) + files = [np.load(filename) for filename in file_paths] + if return_val: + return (files[0], files[1]), (files[2], files[3]), (files[4], files[5]) + else: + return files[0], files[2], files[4] + + +########################################################################### +# adult +########################################################################### + +def maybe_download(train_data, test_data): + import pandas as pd + """if adult data "train.csv" and "test.csv" are not in your directory, + download them. 
+ """ + + COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", + "marital_status", "occupation", "relationship", "race", "gender", + "capital_gain", "capital_loss", "hours_per_week", "native_country", + "income_bracket"] + + if not os.path.exists(train_data): + print("downloading training data...") + df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + names=COLUMNS, skipinitialspace=True) + else: + df_train = pd.read_csv("train.csv") + + if not os.path.exists(test_data): + print("downloading testing data...") + df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", + names=COLUMNS, skipinitialspace=True, skiprows=1) + else: + df_test = pd.read_csv("test.csv") + + return df_train, df_test + + +def cross_columns(x_cols): + """simple helper to build the crossed columns in a pandas dataframe + """ + crossed_columns = dict() + colnames = ['_'.join(x_c) for x_c in x_cols] + for cname, x_c in zip(colnames, x_cols): + crossed_columns[cname] = x_c + return crossed_columns + + +def val2idx(df, cols): + """helper to index categorical columns before embeddings. + """ + val_types = dict() + for c in cols: + val_types[c] = df[c].unique() + + val_to_idx = dict() + for k, v in val_types.items(): + val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} + + for k, v in val_to_idx.items(): + df[k] = df[k].apply(lambda x: v[x]) + + unique_vals = dict() + for c in cols: + unique_vals[c] = df[c].nunique() + + return df, unique_vals + + +def onehot(x): + from sklearn.preprocessing import OneHotEncoder + return np.array(OneHotEncoder().fit_transform(x).todense()) + + +def wide(df_train, df_test, wide_cols, x_cols, target): + import pandas as pd + print('Processing wide data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_wide = pd.concat([df_train, df_test]) + + crossed_columns_d = cross_columns(x_cols) + categorical_columns = list( + df_wide.select_dtypes(include=['object']).columns) + + wide_cols += list(crossed_columns_d.keys()) + + for k, v in crossed_columns_d.items(): + df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) + + df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] + + dummy_cols = [ + c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] + df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) + + train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + assert all(train.columns == test.columns) + + cols = [c for c in train.columns if c != target] + X_train = train[cols].values + y_train = train[target].values.reshape(-1, 1) + X_test = test[cols].values + y_test = test[target].values.reshape(-1, 1) + return X_train, y_train, X_test, y_test + + +def load_adult_data(return_val=True): + import pandas as pd + df_train, df_test = maybe_download("train.csv", "test.csv") + + df_train['income_label'] = ( + df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + df_test['income_label'] = ( + df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + + age_groups = [0, 25, 65, 90] + age_labels = range(len(age_groups) - 1) + df_train['age_group'] = pd.cut( + df_train['age'], age_groups, labels=age_labels) + df_test['age_group'] = pd.cut( + df_test['age'], age_groups, labels=age_labels) + + # columns for wide model + wide_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 
'native_country', 'age_group'] + x_cols = (['education', 'occupation'], ['native_country', 'occupation']) + + # columns for deep model + embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country'] + cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] + + target = 'income_label' + + x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( + df_train, df_test, wide_cols, x_cols, target) + x_train_wide = np.array(x_train_wide).astype(np.float32) + x_test_wide = np.array(x_test_wide).astype(np.float32) + + print('Processing deep data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_deep = pd.concat([df_train, df_test]) + + deep_cols = embedding_cols + cont_cols + df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + # fit on the training rows, then scale the whole concatenated frame in place + # (assigning the raw array avoids realigning on the duplicated indices from pd.concat) + scaler.fit(df_train[cont_cols]) + df_deep[cont_cols] = scaler.transform(df_deep[cont_cols]) + df_deep, unique_vals = val2idx(df_deep, embedding_cols) + + train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + + x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) + y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) + x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) + y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) + + x_train_deep = np.transpose(x_train_deep) + x_test_deep = np.transpose(x_test_deep) + y_train = onehot(y_train) + y_test = onehot(y_test) + + if return_val: + return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test + else: + return x_train_deep, x_train_wide, y_train + + +########################################################################### +# avazu +########################################################################### + +def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')): + import pandas as pd + # please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data + train_file = os.path.join(path, 'train.csv') + # test_file = os.path.join(path, 'test.csv') # useless, no labels + + df_train = pd.read_csv(train_file) + sparse_feats = process_sparse_feats(df_train, df_train.columns[2:]) + # the embedding num for each feature: + # [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60] + # sum: 9449445 + + np.save(os.path.join(path, 'sparse.npy'), sparse_feats) + + +if __name__ == '__main__': + download_criteo(os.path.join(os.path.split( + os.path.abspath(__file__))[0], '../datasets/criteo')) diff --git a/examples/ctr/models/wdl_adult.py b/examples/ctr/models/wdl_adult.py new file mode 100644 index 0000000..1ce77e0 --- /dev/null +++ b/examples/ctr/models/wdl_adult.py @@ -0,0 +1,56 @@ +import hetu as ht +from hetu import init + + +def wdl_adult(X_deep, X_wide, y_): + lr = 5 / 128 + dim_wide = 809 + dim_deep = 68 + + W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W") + W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1") + b1 = init.random_normal([50], stddev=0.1, name="b1") + W2 = init.random_normal([50, 20], stddev=0.1, name="W2") + b2 = init.random_normal([20], stddev=0.1, name="b2") + + # deep + Embedding = [] + X_deep_input = None + + for i in range(8): + Embedding_name = "Embedding_deep_" + str(i) + Embedding.append(init.random_normal( + [50, 8],
stddev=0.1, name=Embedding_name)) + now = ht.embedding_lookup_op(Embedding[i], X_deep[i]) + now = ht.array_reshape_op(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + for i in range(4): + now = ht.array_reshape_op(X_deep[i + 8], (-1, 1)) + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + mat1 = ht.matmul_op(X_deep_input, W1) + add1 = mat1 + ht.broadcastto_op(b1, mat1) + relu1 = ht.relu_op(add1) + dropout1 = relu1 + mat2 = ht.matmul_op(dropout1, W2) + add2 = mat2 + ht.broadcastto_op(b2, mat2) + relu2 = ht.relu_op(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = ht.concat_op(X_wide, dmodel, 1) + wmodel = ht.matmul_op(wmodel, W) + + prediction = wmodel + loss = ht.softmaxcrossentropy_op(prediction, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=lr) + train_op = opt.minimize(loss) + + return loss, prediction, y_, train_op diff --git a/examples/ctr/models/wdl_criteo.py b/examples/ctr/models/wdl_criteo.py new file mode 100644 index 0000000..416ae24 --- /dev/null +++ b/examples/ctr/models/wdl_criteo.py @@ -0,0 +1,42 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def wdl_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_input = ht.embedding_lookup_op( + Embedding, sparse_input, ctx=ht.cpu(0)) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + + # DNN + flatten = dense_input + W1 = init.random_normal([13, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 256], stddev=0.01, name="W3") + + W4 = init.random_normal( + [256 + 26*embedding_size, 1], stddev=0.01, name="W4") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = ht.concat_op(sparse_input, y3, axis=1) + y = ht.matmul_op(y4, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/run_hetu.py b/examples/ctr/run_hetu.py new file mode 100644 index 0000000..5745af3 --- /dev/null +++ b/examples/ctr/run_hetu.py @@ -0,0 +1,230 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import os.path as osp +import numpy as np +import yaml +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + + +def worker(args): + def train(iterations, auc_enabled=True, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + train_loss = [] + train_acc = [] + if auc_enabled: + train_auc = [] + for it in localiter: + loss_val, predict_y, y_val, _ = executor.run( + 'train', convert_to_numpy_ret_vals=True) + if y_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + y_val, + predict_y > 0.5).astype(np.float32) + else: + acc_val = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + if auc_enabled: + train_auc.append(metrics.roc_auc_score(y_val, predict_y)) + if auc_enabled: + return np.mean(train_loss), 
np.mean(train_acc), np.mean(train_auc) + else: + return np.mean(train_loss), np.mean(train_acc) + + def validate(iterations, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + test_loss = [] + test_acc = [] + test_auc = [] + for it in localiter: + loss_val, test_y_predicted, y_test_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + if y_test_val.shape[1] == 1: # for criteo case + correct_prediction = np.equal( + y_test_val, + test_y_predicted > 0.5).astype(np.float32) + else: + correct_prediction = np.equal( + np.argmax(y_test_val, 1), + np.argmax(test_y_predicted, 1)).astype(np.float32) + test_loss.append(loss_val[0]) + test_acc.append(correct_prediction) + test_auc.append(metrics.roc_auc_score( + y_test_val, test_y_predicted)) + return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc) + + def get_current_shard(data): + if args.comm is not None: + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + else: + return data + + batch_size = 128 + dataset = args.dataset + model = args.model + device_id = 0 + + if args.comm == 'PS': + rank = ht.get_worker_communicate().rank() + nrank = int(os.environ['DMLC_NUM_WORKER']) + device_id = rank % 8 + elif args.comm == 'Hybrid': + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + rank = comm.rank + nrank = int(os.environ['DMLC_NUM_WORKER']) + + if dataset == 'criteo': + # define models for criteo + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, labels = process_all_criteo_data( + return_val=args.val) + elif args.val: + from models.load_data import process_head_criteo_data + dense, sparse, labels = process_head_criteo_data(return_val=True) + else: + from models.load_data import process_sampled_criteo_data + dense, sparse, labels = process_sampled_criteo_data() + if isinstance(dense, tuple): + dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [ + get_current_shard(dense[1]), batch_size, 'validate']]) + sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [ + get_current_shard(sparse[1]), batch_size, 'validate']]) + y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [ + get_current_shard(labels[1]), batch_size, 'validate']]) + else: + dense_input = ht.dataloader_op( + [[get_current_shard(dense), batch_size, 'train']]) + sparse_input = ht.dataloader_op( + [[get_current_shard(sparse), batch_size, 'train']]) + y_ = ht.dataloader_op( + [[get_current_shard(labels), batch_size, 'train']]) + elif dataset == 'adult': + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data() + dense_input = [ + ht.dataloader_op([ + [get_current_shard(x_train_deep[:, i]), batch_size, 'train'], + [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'], + ]) for i in range(12) + ] + sparse_input = ht.dataloader_op([ + [get_current_shard(x_train_wide), batch_size, 'train'], + [get_current_shard(x_test_wide), batch_size, 'validate'], + ]) + y_ = ht.dataloader_op([ + [get_current_shard(y_train), batch_size, 'train'], + [get_current_shard(y_test), batch_size, 'validate'], + ]) + else: + raise NotImplementedError + print("Data loaded.") + + loss, prediction, y_, train_op = model(dense_input, sparse_input, y_) + + eval_nodes = {'train': [loss, prediction, y_, train_op]} + if args.val: + 
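# the 'validate' group reuses the same loss/prediction nodes but omits train_op, + # so executor.run('validate') only runs forward passes without updating parameters. +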
print('Validation enabled...') + eval_nodes['validate'] = [loss, prediction, y_] + executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs') + executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id), + comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path) + + if args.all and dataset == 'criteo': + print('Processing all data...') + file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[ + args.comm], args.raw_model) + file_path += '%d.log' % rank if args.comm else '.log' + file_path = osp.join(osp.dirname( + osp.abspath(__file__)), 'logs', file_path) + log_file = open(file_path, 'w') + total_epoch = args.nepoch if args.nepoch > 0 else 11 + for ep in range(total_epoch): + print("ep: %d" % ep) + ep_st = time.time() + train_loss, train_acc, train_auc = train(executor.get_batch_num( + 'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ + % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (train_loss, train_acc, train_auc, ep_en - ep_st) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + else: + total_epoch = args.nepoch if args.nepoch > 0 else 50 + for ep in range(total_epoch): + if ep == 5: + start = time.time() + print("epoch %d" % ep) + ep_st = time.time() + train_loss, train_acc = train( + executor.get_batch_num('train'), auc_enabled=False) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" + % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc)) + else: + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (train_loss, train_acc, ep_en - ep_st)) + print('all time:', time.time() - start) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + parser.add_argument("--comm", default=None, + help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument("--config", type=str, default=osp.join(osp.dirname( + osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps") + parser.add_argument("--nepoch", type=int, default=-1, + help="num of epochs, each train 1/10 data") + args = parser.parse_args() + import models + print('Model:', args.model) + model = eval('models.' 
+ args.model) + args.dataset = args.model.split('_')[-1] + args.raw_model = args.model + args.model = model + if args.comm is None: + worker(args) + elif args.comm == 'Hybrid': + settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader) + value = settings['shared'] + os.environ['DMLC_ROLE'] = 'worker' + for k, v in value.items(): + os.environ[k] = str(v) + worker(args) + elif args.comm == 'PS': + launch(worker, args) + else: + raise NotImplementedError diff --git a/examples/ctr/run_tf_horovod.py b/examples/ctr/run_tf_horovod.py new file mode 100644 index 0000000..dc590cc --- /dev/null +++ b/examples/ctr/run_tf_horovod.py @@ -0,0 +1,174 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +import horovod.tensorflow as hvd + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model +# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model +# if using multi nodes setting in conda, need to modify /etc/bash.bashrc +# we can also use mpirun (default gloo): +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model + + +def train_criteo(model, args): + hvd.init() + + def get_current_shard(data): + part_size = data.shape[0] // hvd.size() + start = part_size * hvd.rank() + end = start + part_size if hvd.rank() != hvd.size() - \ + 1 else data.shape[0] + return data[start:end] + + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature = get_current_shard(dense[0]) + sparse_feature = get_current_shard(sparse[0]) + labels = get_current_shard(all_labels[0]) + val_dense = get_current_shard(dense[1]) + val_sparse = get_current_shard(sparse[1]) + val_labels = get_current_shard(all_labels[1]) + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + dense_feature = get_current_shard(dense_feature) + sparse_feature = get_current_shard(sparse_feature) + labels = get_current_shard(labels) + + batch_size = 128 + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + loss, y, opt = model(dense_input, sparse_input, y_) + global_step = tf.train.get_or_create_global_step() + # here in DistributedOptimizer by default all tensor are reduced on GPU + # can use device_sparse=xxx, device_dense=xxx to modify + # if using device_sparse='/cpu:0', the performance degrades + train_op = hvd.DistributedOptimizer( + opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions( + allow_growth=True, visible_device_list=str(hvd.local_rank())) + # here horovod default use gpu to initialize, which will cause OOM + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession( + hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if 
args.all: + raw_log_file = './logs/tf_hvd_%s_%d.log' % ( + args.model, hvd.local_rank()) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 400 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iterations = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + train_criteo(model, args) + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/run_tf_local.py b/examples/ctr/run_tf_local.py new file mode 100644 index 0000000..1b0bc6d --- /dev/null +++ b/examples/ctr/run_tf_local.py @@ -0,0 +1,202 @@ +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + + +def train_criteo(model, args): + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature, val_dense = dense + sparse_feature, val_sparse = sparse + labels, val_labels = all_labels + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + + batch_size = 128 + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + loss, y, opt = model(dense_input, sparse_input, y_) + train_op = opt.minimize(loss) + + init = tf.compat.v1.global_variables_initializer() + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + sess = tf.compat.v1.Session( + config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + sess.run(init) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = './logs/tf_local_%s.log' % (args.model) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 11 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iteration = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: 
start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def train_adult(model): + batch_size = 128 + total_epoch = 50 + dim_wide = 809 + + X_deep = [] + for i in range(8): + X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1])) + for i in range(4): + X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1])) + X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide]) + y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2]) + loss, y, train_op = model(X_deep, X_wide, y_) + + init = tf.global_variables_initializer() + + gpu_options = tf.GPUOptions(allow_growth=True) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + + sess.run(init) + + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False) + + iterations = x_train_deep.shape[0] // batch_size + for ep in range(total_epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + pre_index = 0 + + for it in range(iterations): + batch_x_deep = x_train_deep[pre_index:pre_index + batch_size] + batch_x_wide = x_train_wide[pre_index:pre_index + batch_size] + batch_y = y_train[pre_index:pre_index + batch_size] + pre_index += batch_size + + my_feed_dict = dict() + for i in range(12): + my_feed_dict[X_deep[i]] = np.array( + batch_x_deep[:, 1]).reshape(-1, 1) + + my_feed_dict[X_wide] = np.array(batch_x_wide) + my_feed_dict[y_] = batch_y + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + acc_val = np.equal( + np.argmax(loss_val[1], 1), + np.argmax(loss_val[2], 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + + if dataset == 'criteo': + train_criteo(model, args) + elif dataset == 'adult': + train_adult(model) + else: + raise NotImplementedError + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/run_tf_parallax.py b/examples/ctr/run_tf_parallax.py new file mode 100644 index 0000000..9642775 --- /dev/null +++ b/examples/ctr/run_tf_parallax.py @@ -0,0 +1,211 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + +from autodist import AutoDist +from autodist.resource_spec import ResourceSpec +from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax +from autodist.strategy.base import Strategy +from autodist.kernel.common.utils import get_op_name +from tensorflow.python.framework import ops + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# Please DO NOT modify /etc/bash.bashrc to activate conda environment. +# Use python_venv in spec yml file instead. +# Use absolute path of python file. +# Here we use the tf native partitioner instead of autodist's PartitionPS. + + +class Parallaxx(PSLoadBalancing, AllReduce): + """ + Modify original parallax to remove replica on CPUs. + """ + + def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0): + PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness) + AllReduce.__init__(self, chunk_size) + + # pylint: disable=attribute-defined-outside-init + def build(self, graph_item, resource_spec): + """Generate the strategy.""" + expr = Strategy() + + # For each variable, generate variable synchronizer config + expr.graph_config.replicas.extend( + [k for k, v in resource_spec.gpu_devices]) + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + self.loads = {ps: 0.0 for ps in reduction_device_names} + + # Generate node config + node_config = [] + for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): + var_op_name = get_op_name(var.name) + grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] + if isinstance(grad, ops.Tensor): # this is a dense variable + group_id = idx // self.chunk_size + config = self._gen_all_reduce_node_config( + var.name, group=group_id) + else: # sparse updates + # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. + # Sparse variables are likely larger, so keeping copies would be costlier, + # and usually each device only requires a small part of the overall variable. + config = self._gen_ps_node_config( + var, + # For Parallax Strategy, all PS vars are sparse which does not need proxy. 
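+ # (the False below disables the local proxy variable, so these large sparse + # embeddings live only on the parameter servers and are not replicated per device)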
+ False, + self._sync, + self._staleness + ) + node_config.append(config) + expr.node_config.extend(node_config) + + return expr + + +def train_criteo(model, args): + resource_spec_file = os.path.join(os.path.dirname( + __file__), 'settings', 'plx_local_spec.yml') + autodist = AutoDist(resource_spec_file, Parallaxx()) + respec = ResourceSpec(resource_spec_file) + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature, val_dense = dense + sparse_feature, val_sparse = sparse + labels, val_labels = all_labels + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + + # autodist will split the feeding data + batch_size = 128 + with tf.Graph().as_default() as g, autodist.scope(): + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + embed_partitioner = tf.fixed_size_partitioner( + len(respec.nodes), 0) if len(respec.nodes) > 1 else None + loss, y, opt = model(dense_input, sparse_input, + y_, embed_partitioner, False) + train_op = opt.minimize(loss) + + sess = autodist.create_distributed_session() + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = os.path.join(os.path.split(os.path.abspath(__file__))[ + 0], 'logs', 'tf_plx_%s.log' % (args.model)) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 11 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run( + [loss, y, y_, train_op], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iteration = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + 
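# sess.run below fetches [loss, y, y_, train_op]; the evaluated predictions and + # labels are used for per-batch accuracy (0.5 threshold for the single-logit + # criteo output, argmax for one-hot labels). +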
loss_val = sess.run( + [loss, y, y_, train_op], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' + raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + + if dataset == 'criteo': + train_criteo(model, args) + else: + raise NotImplementedError + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/settings/local_s1.yml b/examples/ctr/settings/local_s1.yml new file mode 100644 index 0000000..a60a56b --- /dev/null +++ b/examples/ctr/settings/local_s1.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 0 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w2.yml b/examples/ctr/settings/local_s1_w2.yml new file mode 100644 index 0000000..34dc439 --- /dev/null +++ b/examples/ctr/settings/local_s1_w2.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 2 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 2 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w4.yml b/examples/ctr/settings/local_s1_w4.yml new file mode 100644 index 0000000..0790c40 --- /dev/null +++ b/examples/ctr/settings/local_s1_w4.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 4 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w8.yml b/examples/ctr/settings/local_s1_w8.yml new file mode 100644 index 0000000..a795608 --- /dev/null +++ b/examples/ctr/settings/local_s1_w8.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 8 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 8 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_w4.yml b/examples/ctr/settings/local_w4.yml new file mode 100644 index 0000000..97b5bcd --- /dev/null +++ b/examples/ctr/settings/local_w4.yml @@ -0,0 +1,6 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 \ No newline at end of file diff --git a/examples/ctr/settings/plx_local_spec.yml b/examples/ctr/settings/plx_local_spec.yml new file mode 100644 index 0000000..11f9025 --- /dev/null +++ b/examples/ctr/settings/plx_local_spec.yml @@ -0,0 +1,4 @@ +nodes: + - address: localhost + cpus: [0] + gpus: [0,1,2,3,4,5,6,7] diff --git a/examples/ctr/settings/tf_local_s1_w2.json b/examples/ctr/settings/tf_local_s1_w2.json new file mode 100644 index 
0000000..c1e0148 --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w2.json @@ -0,0 +1,9 @@ +{ + "worker": [ + "127.0.0.1:12349", + "127.0.0.1:12348" + ], + "ps": [ + "127.0.0.1:12345" + ] +} \ No newline at end of file diff --git a/examples/ctr/settings/tf_local_s1_w4.json b/examples/ctr/settings/tf_local_s1_w4.json new file mode 100644 index 0000000..d0c339b --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w4.json @@ -0,0 +1,11 @@ +{ + "worker": [ + "127.0.0.1:23459", + "127.0.0.1:23458", + "127.0.0.1:23457", + "127.0.0.1:23456" + ], + "ps": [ + "127.0.0.1:23455" + ] +} \ No newline at end of file diff --git a/examples/ctr/settings/tf_local_s1_w8.json b/examples/ctr/settings/tf_local_s1_w8.json new file mode 100644 index 0000000..05d1f0e --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w8.json @@ -0,0 +1,15 @@ +{ + "worker": [ + "127.0.0.1:34569", + "127.0.0.1:34568", + "127.0.0.1:34567", + "127.0.0.1:34566", + "127.0.0.1:34565", + "127.0.0.1:34564", + "127.0.0.1:34563", + "127.0.0.1:34562" + ], + "ps": [ + "127.0.0.1:34575" + ] +} \ No newline at end of file diff --git a/examples/ctr/tests/hybrid_dcn_criteo.sh b/examples/ctr/tests/hybrid_dcn_criteo.sh new file mode 100644 index 0000000..8fcb203 --- /dev/null +++ b/examples/ctr/tests/hybrid_dcn_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_dfm_criteo.sh b/examples/ctr/tests/hybrid_dfm_criteo.sh new file mode 100644 index 0000000..caaa171 --- /dev/null +++ b/examples/ctr/tests/hybrid_dfm_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_wdl_adult.sh b/examples/ctr/tests/hybrid_wdl_adult.sh new file mode 100644 index 0000000..d3f41ee --- /dev/null +++ b/examples/ctr/tests/hybrid_wdl_adult.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_wdl_criteo.sh b/examples/ctr/tests/hybrid_wdl_criteo.sh new file mode 100644 index 0000000..36a6b88 --- /dev/null +++ b/examples/ctr/tests/hybrid_wdl_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/local_dcn_criteo.sh b/examples/ctr/tests/local_dcn_criteo.sh new file mode 100644 index 0000000..f1cd439 --- /dev/null +++ b/examples/ctr/tests/local_dcn_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model 
dcn_criteo --val diff --git a/examples/ctr/tests/local_dfm_criteo.sh b/examples/ctr/tests/local_dfm_criteo.sh new file mode 100644 index 0000000..d57378c --- /dev/null +++ b/examples/ctr/tests/local_dfm_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dfm_criteo --val diff --git a/examples/ctr/tests/local_wdl_adult.sh b/examples/ctr/tests/local_wdl_adult.sh new file mode 100644 index 0000000..4ccb1cd --- /dev/null +++ b/examples/ctr/tests/local_wdl_adult.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_adult --val diff --git a/examples/ctr/tests/local_wdl_criteo.sh b/examples/ctr/tests/local_wdl_criteo.sh new file mode 100644 index 0000000..cfd2e45 --- /dev/null +++ b/examples/ctr/tests/local_wdl_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_criteo --val diff --git a/examples/ctr/tests/ps_dcn_criteo.sh b/examples/ctr/tests/ps_dcn_criteo.sh new file mode 100644 index 0000000..fba3350 --- /dev/null +++ b/examples/ctr/tests/ps_dcn_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_dfm_criteo.sh b/examples/ctr/tests/ps_dfm_criteo.sh new file mode 100644 index 0000000..ee423e9 --- /dev/null +++ b/examples/ctr/tests/ps_dfm_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_wdl_adult.sh b/examples/ctr/tests/ps_wdl_adult.sh new file mode 100644 index 0000000..c0165b6 --- /dev/null +++ b/examples/ctr/tests/ps_wdl_adult.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_wdl_criteo.sh b/examples/ctr/tests/ps_wdl_criteo.sh new file mode 100644 index 0000000..8d2f9a1 --- /dev/null +++ b/examples/ctr/tests/ps_wdl_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/tf_2workers.sh b/examples/ctr/tests/tf_2workers.sh new file mode 100644 index 0000000..8f1c902 --- /dev/null +++ b/examples/ctr/tests/tf_2workers.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 1 > ${workdir}/../logs/temp1.log & +wait diff --git a/examples/ctr/tests/tf_4workers.sh b/examples/ctr/tests/tf_4workers.sh new file mode 100644 index 0000000..d1ad330 --- /dev/null +++ b/examples/ctr/tests/tf_4workers.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); 
pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 1 > ${workdir}/../logs/temp1.log & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 2 > ${workdir}/../logs/temp2.log & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 3 > ${workdir}/../logs/temp3.log & +wait diff --git a/examples/ctr/tests/tf_8workers.sh b/examples/ctr/tests/tf_8workers.sh new file mode 100644 index 0000000..d0ede08 --- /dev/null +++ b/examples/ctr/tests/tf_8workers.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 1 > ${workdir}/../logs/temp1.log & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 2 > ${workdir}/../logs/temp2.log & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 3 > ${workdir}/../logs/temp3.log & +CUDA_VISIBLE_DEVICES=4 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 4 > ${workdir}/../logs/temp4.log & +CUDA_VISIBLE_DEVICES=5 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 5 > ${workdir}/../logs/temp5.log & +CUDA_VISIBLE_DEVICES=6 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 6 > ${workdir}/../logs/temp6.log & +CUDA_VISIBLE_DEVICES=7 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 7 > ${workdir}/../logs/temp7.log & +wait diff --git a/examples/ctr/tf_launch_server.py b/examples/ctr/tf_launch_server.py new file mode 100644 index 0000000..aaa0b7c --- /dev/null +++ b/examples/ctr/tf_launch_server.py @@ -0,0 +1,49 @@ +import os +import tensorflow as tf +import multiprocessing +import signal +import json +import argparse + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + os.environ['CUDA_VISIBLE_DEVICES'] = '' + + +pop_env() + + +def start_server(cluster, task_id): + server = tf.train.Server(cluster, job_name='ps', task_index=task_id) + server.join() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s4_w2.json', help="config file path") + parser.add_argument("--id", type=int, required=True) + args = parser.parse_args() + raw_config = args.config + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + global proc + proc = multiprocessing.Process( + target=start_server, args=[cluster, args.id, ]) + proc.start() + signal.signal(signal.SIGINT, signal_handler) + proc.join() + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + global proc + proc.kill() + exit(0) + + +if __name__ == '__main__': + main() diff --git 
a/examples/ctr/tf_launch_worker.py b/examples/ctr/tf_launch_worker.py new file mode 100644 index 0000000..e56825d --- /dev/null +++ b/examples/ctr/tf_launch_worker.py @@ -0,0 +1,353 @@ +import tensorflow as tf +import numpy as np +import argparse +import os +import time +import json +from sklearn import metrics +from tqdm import tqdm + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + + +def train_criteo(model, cluster, task_id, nrank, args): + def get_current_shard(data): + part_size = data.shape[0] // nrank + start = part_size * task_id + end = start + part_size if task_id != nrank - 1 else data.shape[0] + return data[start:end] + + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature = get_current_shard(dense[0]) + sparse_feature = get_current_shard(sparse[0]) + labels = get_current_shard(all_labels[0]) + val_dense = get_current_shard(dense[1]) + val_sparse = get_current_shard(sparse[1]) + val_labels = get_current_shard(all_labels[1]) + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + dense_feature = get_current_shard(dense_feature) + sparse_feature = get_current_shard(sparse_feature) + labels = get_current_shard(labels) + + batch_size = 128 + worker_device = "/job:worker/task:%d/gpu:0" % (task_id) + with tf.device(worker_device): + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): + server_num = len(cluster.as_dict()['ps']) + embed_partitioner = tf.fixed_size_partitioner( + server_num, 0) if server_num > 1 else None + loss, y, opt = model(dense_input, sparse_input, y_, + embed_partitioner, param_on_gpu=False) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + # sess.run(init) + if task_id == 0: + writer = tf.compat.v1.summary.FileWriter('logs/board', sess.graph) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = './logs/tf_dist_%s_%d.log' % (args.model, task_id) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 21 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in range(iterations // 10 + (ep % 10 == 9) * (iterations % 10)): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + 
batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + + if args.val: + val_loss, val_acc, val_auc = [], [], [] + for it in range(val_dense.shape[0] // batch_size): + local_st = it * batch_size + my_feed_dict[dense_input][:] = val_dense[local_st: local_st + batch_size] + my_feed_dict[sparse_input][:] = val_sparse[local_st: local_st + batch_size] + my_feed_dict[y_][:] = val_labels[local_st: local_st+batch_size] + loss_val = sess.run([loss, y, y_], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + val_loss.append(loss_val[0]) + val_acc.append(acc_val) + val_auc.append(metrics.roc_auc_score(true_val, pred_val)) + v_accuracy = np.mean(val_acc) + v_loss = np.mean(val_loss) + v_auc = np.mean(val_auc) + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, v_loss, v_accuracy, v_auc, train_time) + else: + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + else: + # here no val + iteration = dense_feature.shape[0] // batch_size + + epoch = 10 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print("tensorflow: ", (time.time() - start)) + + +def train_adult(model, cluster, task_id, nrank): + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False) + part_size = len(x_train_deep) // nrank + start = part_size * task_id + end = start + part_size if task_id != nrank - 1 else len(x_train_deep) + x_train_deep = x_train_deep[start:end] + x_train_wide = x_train_wide[start:end] + y_train = y_train[start:end] + + batch_size = 128 + total_epoch = 50 + dim_wide = 809 + + worker_device = "/job:worker/task:%d/gpu:0" % (task_id) + with tf.device(worker_device): + X_deep = [] + for i in range(8): + X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1])) + for i in range(4): + 
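# the remaining 4 placeholders take the continuous columns (age, capital_gain, + # capital_loss, hours_per_week); together with the 8 int32 categorical inputs + # above they match the 12 deep columns produced by load_adult_data. +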
X_deep.append(tf.compat.v1.placeholder( + tf.float32, [batch_size, 1])) + X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide]) + y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2]) + loss, y, train_op, global_step = model( + X_deep, X_wide, y_, cluster, task_id) + + with tf.device( + tf.compat.v1.train.replica_device_setter( + worker_device=worker_device, + cluster=cluster)): + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1, + global_step=global_step) + sess_config = tf.ConfigProto( + # allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + + iterations = x_train_deep.shape[0] // batch_size + for ep in range(total_epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + pre_index = 0 + + for it in range(iterations): + batch_x_deep = x_train_deep[pre_index:pre_index + batch_size] + batch_x_wide = x_train_wide[pre_index:pre_index + batch_size] + batch_y = y_train[pre_index:pre_index + batch_size] + pre_index += batch_size + + my_feed_dict = dict() + for i in range(12): + my_feed_dict[X_deep[i]] = np.array( + batch_x_deep[:, i]).reshape(-1, 1) + + my_feed_dict[X_wide] = np.array(batch_x_wide) + my_feed_dict[y_] = batch_y + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + acc_val = np.equal( + np.argmax(loss_val[1], 1), + np.argmax(loss_val[2], 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print("tensorflow: ", (time.time() - start)) + + +def test_bandwidth(cluster, task_id): + print('test bandwidth') + iters = 1000 + params_size = 128 * 9 + ps_device = "/job:ps/task:0/cpu:0" + worker_device = "/job:worker/task:%d/cpu:0" % (task_id) + + with tf.device(ps_device): + dtype = tf.int32 + params = tf.get_variable("params", shape=[params_size], dtype=dtype, + initializer=tf.zeros_initializer()) + with tf.device(tf.compat.v1.train.replica_device_setter( + worker_device=worker_device, + cluster=cluster)): + update = tf.get_variable("update", shape=[params_size], dtype=dtype, + initializer=tf.ones_initializer()) + add_op = params.assign(update) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + # warm up + for i in range(5): + sess.run(add_op.op) + + start_time = time.time() + for i in range(iters): + sess.run(add_op.op) + elapsed_time = time.time() - start_time + ans = float(iters)*(params_size / 1024 / 1024)/elapsed_time + print("transfer rate: %f MB/s" % (ans)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") +
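# one possible local launch (run from examples/ctr, cf. tests/tf_2workers.sh): + # python tf_launch_server.py --config settings/tf_local_s1_w2.json --id 0 & + # CUDA_VISIBLE_DEVICES=0 python tf_launch_worker.py --model wdl_criteo --config settings/tf_local_s1_w2.json --rank 0 & + # CUDA_VISIBLE_DEVICES=1 python tf_launch_worker.py --model wdl_criteo --config settings/tf_local_s1_w2.json --rank 1 +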
parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path") + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + task_id = int(args.rank) + raw_config = args.config + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + if raw_model != 'band': + import tf_models + model = eval('tf_models.' + raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + if dataset == 'criteo': + train_criteo(model, cluster, task_id, len(config['worker']), args) + elif dataset == 'adult': + # not support val or all + train_adult(model, cluster, task_id, len(config['worker'])) + else: + raise NotImplementedError + else: + test_bandwidth(cluster, task_id) + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/tf_models/__init__.py b/examples/ctr/tf_models/__init__.py new file mode 100644 index 0000000..659f885 --- /dev/null +++ b/examples/ctr/tf_models/__init__.py @@ -0,0 +1,4 @@ +from .tf_dcn_criteo import dcn_criteo +from .tf_deepfm_criteo import dfm_criteo +from .tf_wdl_criteo import wdl_criteo +from .tf_wdl_adult import wdl_adult diff --git a/examples/ctr/tf_models/tf_dcn_criteo.py b/examples/ctr/tf_models/tf_dcn_criteo.py new file mode 100644 index 0000000..90a3b58 --- /dev/null +++ b/examples/ctr/tf_models/tf_dcn_criteo.py @@ -0,0 +1,69 @@ +import tensorflow as tf + + +def cross_layer(x0, x1, device): + # x0: input embedding feature (batch_size, 26 * embedding_size + 13) + # x1: the output of last layer (batch_size, 26 * embedding_size + 13) + + embed_dim = x1.shape[-1] + with tf.device(device): + w = tf.compat.v1.get_variable(name='w', shape=(embed_dim,)) + b = tf.compat.v1.get_variable(name='b', shape=(embed_dim,)) + x_1w = tf.tensordot(tf.reshape(x1, [-1, 1, embed_dim]), w, axes=1) + cross = x0 * x_1w + return cross + x1 + b + + +def build_cross_layer(x0, num_layers=3, device=tf.device('/gpu:0')): + x1 = x0 + for i in range(num_layers): + with tf.compat.v1.variable_scope('layer%d' % i): + x1 = cross_layer(x0, x1, device) + return x1 + + +def dcn_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.003 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('dcn', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding = tf.compat.v1.get_variable(name="Embedding", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_input_embedding = tf.nn.embedding_lookup( + Embedding, sparse_input) + + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + W1 = tf.compat.v1.get_variable( + name='W1', shape=[26*embedding_size + 13, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256]) + W4 = tf.compat.v1.get_variable( + name='W4', shape=[256 + 26 * embedding_size + 13, 1]) + + with tf.device('/gpu:0'): + flatten = tf.reshape(sparse_input_embedding, + (-1, 26*embedding_size)) + x = tf.concat((flatten, dense_input), 1) + # 
CrossNet + cross_output = build_cross_layer(x, num_layers=3, device=device) + # DNN + flatten = x + + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = tf.concat((cross_output, y3), 1) + y = tf.matmul(y4, W4) + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/ctr/tf_models/tf_deepfm_criteo.py b/examples/ctr/tf_models/tf_deepfm_criteo.py new file mode 100644 index 0000000..ea9d8f6 --- /dev/null +++ b/examples/ctr/tf_models/tf_deepfm_criteo.py @@ -0,0 +1,62 @@ +import tensorflow as tf + + +def dfm_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('dfm', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding1 = tf.compat.v1.get_variable(name="Embedding1", shape=( + feature_dimension, 1), partitioner=embed_partitioner) + Embedding2 = tf.compat.v1.get_variable(name="embeddings", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_1dim_input = tf.nn.embedding_lookup( + Embedding1, sparse_input) + sparse_2dim_input = tf.nn.embedding_lookup( + Embedding2, sparse_input) + + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + FM_W = tf.compat.v1.get_variable(name='FM_W', shape=[13, 1]) + W1 = tf.compat.v1.get_variable( + name='W1', shape=[26*embedding_size, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 1]) + + with tf.device('/gpu:0'): + fm_dense_part = tf.matmul(dense_input, FM_W) + fm_sparse_part = tf.reduce_sum(sparse_1dim_input, 1) + # fst order output + y1 = fm_dense_part + fm_sparse_part + + sparse_2dim_sum = tf.reduce_sum(sparse_2dim_input, 1) + sparse_2dim_sum_square = tf.multiply( + sparse_2dim_sum, sparse_2dim_sum) + + sparse_2dim_square = tf.multiply( + sparse_2dim_input, sparse_2dim_input) + sparse_2dim_square_sum = tf.reduce_sum(sparse_2dim_square, 1) + sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum + sparse_2dim_half = sparse_2dim * 0.5 + # snd order output + y2 = tf.reduce_sum(sparse_2dim_half, 1, keepdims=True) + + # DNN + flatten = tf.reshape(sparse_2dim_input, (-1, 26*embedding_size)) + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = y1 + y2 + y = y4 + y3 + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/ctr/tf_models/tf_wdl_adult.py b/examples/ctr/tf_models/tf_wdl_adult.py new file mode 100644 index 0000000..4137d08 --- /dev/null +++ b/examples/ctr/tf_models/tf_wdl_adult.py @@ -0,0 +1,77 @@ +import tensorflow as tf +import numpy as np + + +def wdl_adult(X_deep, X_wide, y_, cluster=None, task_id=None): + lr_ = 5 / 128 + dim_wide = 809 + dim_deep = 68 + use_ps = cluster is not None + + if use_ps: + device = 
tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d/gpu:0" % (task_id), + cluster=cluster)) + else: + device = tf.device('/gpu:0') + global_step = tf.Variable(0, name="global_step", trainable=False) + with device: + if use_ps: + global_step = tf.Variable(0, name="global_step", trainable=False) + + rand = np.random.RandomState(seed=123) + W = tf.Variable(rand.normal(scale=0.1, size=[ + dim_wide+20, 2]), dtype=tf.float32) + W1 = tf.Variable(rand.normal(scale=0.1, size=[ + dim_deep, 50]), dtype=tf.float32) + b1 = tf.Variable(rand.normal(scale=0.1, size=[50]), dtype=tf.float32) + W2 = tf.Variable(rand.normal( + scale=0.1, size=[50, 20]), dtype=tf.float32) + b2 = tf.Variable(rand.normal(scale=0.1, size=[20]), dtype=tf.float32) + + Embedding = [] + + for i in range(8): + Embedding.append(tf.Variable(rand.normal( + scale=0.1, size=[20, 8]), dtype=tf.float32)) + + # deep + X_deep_input = None + for i in range(8): + now = tf.nn.embedding_lookup(Embedding[i], X_deep[i]) + now = tf.reshape(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = tf.concat([X_deep_input, now], 1) + + for i in range(4): + now = tf.reshape(X_deep[i + 8], (-1, 1)) + X_deep_input = tf.concat([X_deep_input, now], 1) + + mat1 = tf.matmul(X_deep_input, W1) + add1 = tf.add(mat1, b1) + relu1 = tf.nn.relu(add1) + dropout1 = relu1 + mat2 = tf.matmul(dropout1, W2) + add2 = tf.add(mat2, b2) + relu2 = tf.nn.relu(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = tf.concat([X_wide, dmodel], 1) + wmodel = tf.matmul(wmodel, W) + + y = wmodel + loss = tf.reduce_mean( + tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y) + ) + + optimizer = tf.train.GradientDescentOptimizer(lr_) + train_op = optimizer.minimize(loss, global_step=global_step) + + if use_ps: + return loss, y, train_op, global_step + else: + return loss, y, train_op diff --git a/examples/ctr/tf_models/tf_wdl_criteo.py b/examples/ctr/tf_models/tf_wdl_criteo.py new file mode 100644 index 0000000..d3d7454 --- /dev/null +++ b/examples/ctr/tf_models/tf_wdl_criteo.py @@ -0,0 +1,40 @@ +import tensorflow as tf + + +def wdl_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('wdl', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding = tf.compat.v1.get_variable(name="Embedding", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_input_embedding = tf.nn.embedding_lookup( + Embedding, sparse_input) + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + W1 = tf.compat.v1.get_variable(name='W1', shape=[13, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256]) + W4 = tf.compat.v1.get_variable( + name='W4', shape=[256 + 26 * embedding_size, 1]) + with tf.device('/gpu:0'): + sparse_input_embedding = tf.reshape( + sparse_input_embedding, (-1, 26*embedding_size)) + flatten = dense_input + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = tf.concat((sparse_input_embedding, y3), 1) + y = tf.matmul(y4, W4) + loss = 
tf.reduce_mean(
+                tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_))
+
+            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+                learning_rate)
+    return loss, y, optimizer
diff --git a/examples/gnn/README.md b/examples/gnn/README.md
new file mode 100644
index 0000000..28ea840
--- /dev/null
+++ b/examples/gnn/README.md
@@ -0,0 +1,112 @@
+# GNN Examples (with Distributed Settings)
+
+## Structure
+```
+- gnn
+    - gnn_tools/          scripts to prepare data and other utilities
+    - config/             distributed configurations
+    - gnn_model/          gnn models
+    - run_dist.py         train gnn models in the PS setting
+    - run_dist_hybrid.py  train gnn models in the hybrid setting
+    - run_single.py       train with a single gpu
+
+```
+
+## Configuration file explained
+
+We use a simple yaml file to specify the run configuration.
+
+```yaml
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 4
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 4
+  server : 1
+  graph_server : 1
+  scheduler : true
+```
+
+The four key-value pairs in "shared" are used by the PS-lite parameter server and will be added to the environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" to an IP address reachable within the cluster.
+
+The difference between GNN models and other models is that we need to launch a set of graph servers to carry out graph sampling. Note that the total number of graph servers MUST equal the number of graph partitions. It is recommended to partition the graph into as many parts as there are machines and to launch one graph server on each machine.
+
+Note that there should be only one scheduler, and it should only be launched on the machine with DMLC_PS_ROOT_URI.
+
+Note that the launcher automatically selects a network interface for you. If this fails, try adding "DMLC_INTERFACE : eth0" to the "shared" section to select the right network device.
+
+## Prepare graph datasets
+
+1. Prepare a normal dataset (dense features, no embedding)
+
+   ```shell
+   python3 -m graphmix.partition [-d DatasetName] -n 4 -p ~/yourDataPath
+   ```
+
+   We currently provide the following datasets: Cora, PubMed, Reddit, Flickr, Yelp, ogbn-products, ogbn-arxiv.
+
+2. Prepare the ogbn-mag or Reddit dataset (with sparse embedding)
+
+   You can use the following command to partition the graph into 4 parts for 4 workers to use.
+
+   ```bash
+   python3 gnn_tools/part_graph.py [-d DatasetName] -n 4 -p ~/yourDataPath
+   ```
+
+   Also note that if you want to train on K nodes, replace -n 4 with -n K.
+
+3. Prepare the Amazon dataset: this dataset is introduced in the Cluster-GCN paper, and there are two files to download: [metadata.json](https://drive.google.com/file/d/0B2jJQxNRDl_rVVZCdWVnYmUyRDg) and [map_files](https://drive.google.com/file/d/0B3lPMIHmG6vGd2U3VHB0Wkk4cGM). Once you have downloaded and extracted the files and put them together under the gnn_tools directory, you can run
+
+   ```bash
+   python3 prepare_amazon_data.py
+   ```
+
+   Note that you need nltk installed in your environment to run this script, and it will take a while.
+
+   After running the script, you will get two output files: graph.npz and sparsefeature.npy. Put them in the right place:
+
+   ```bash
+   mkdir -p ~/.graphmix_dataset/AmazonSparse
+   mv graph.npz sparsefeature.npy ~/.graphmix_dataset/AmazonSparse
+   ```
+
+   Finally, use part_graph.py to partition the graph:
+
+   ```
+   python3 gnn_tools/part_graph.py -d AmazonSparse -n 4 -p ~/yourDataPath
+   ```
+
+## Training GNN Models
+
+After you have prepared a graph dataset, you can start training embedding models on it. We take Reddit as an example.
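+
+For instance, a complete single-machine run on Reddit prepared with the sparse-embedding script above could look like the following sketch, using the bundled 4-worker config (`config/local_w4.yml`); the data path is a placeholder, and the individual communication modes are described below:
+
+```bash
+# partition Reddit into 4 parts (done once); the 4 parts match the 4 graph servers in config/local_w4.yml
+python3 gnn_tools/part_graph.py -d Reddit -n 4 -p ~/yourDataPath
+# launch PS-mode training: 4 workers, 1 server, 4 graph servers and the scheduler on this machine
+python3 run_dist.py config/local_w4.yml -p ~/yourDataPath/Reddit
+```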
+
+To train in PS communication mode, run
+
+```
+python3 run_dist.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+To train in Hybrid communication mode, run
+
+```
+mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+When running in Hybrid mode, you will also have to launch the servers and the scheduler separately
+
+```
+python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit --server
+```
+
+Add the --dense argument if you are training with a normal dataset (dense features).
+
+## Train with a single card
+
+In this case you still have to run the partition step described above, with -n 1. After that, run
+
+```shell
+python3 run_single.py -p ~/yourDataPath/Reddit [--dense]
+```
+
diff --git a/examples/gnn/config/local_w2.yml b/examples/gnn/config/local_w2.yml
new file mode 100644
index 0000000..5e53073
--- /dev/null
+++ b/examples/gnn/config/local_w2.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 2
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 2
+  server : 1
+  graph_server : 1
+  scheduler : true
diff --git a/examples/gnn/config/local_w4.yml b/examples/gnn/config/local_w4.yml
new file mode 100644
index 0000000..8a7ea2d
--- /dev/null
+++ b/examples/gnn/config/local_w4.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 4
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 4
+  server : 1
+  graph_server : 4
+  scheduler : true
diff --git a/examples/gnn/config/local_w8.yml b/examples/gnn/config/local_w8.yml
new file mode 100644
index 0000000..cc83196
--- /dev/null
+++ b/examples/gnn/config/local_w8.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 8
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 8
+  server : 1
+  graph_server : 4
+  scheduler : true
diff --git a/examples/gnn/config/single.yml b/examples/gnn/config/single.yml
new file mode 100644
index 0000000..f669fca
--- /dev/null
+++ b/examples/gnn/config/single.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 1
+  DMLC_NUM_SERVER : 0
+launch :
+  worker : 1
+  server : 0
+  graph_server : 1
+  scheduler : true
diff --git a/examples/gnn/gnn_model/__init__.py b/examples/gnn/gnn_model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/gnn/gnn_model/layer.py b/examples/gnn/gnn_model/layer.py
new file mode 100644
index 0000000..9270a02
--- /dev/null
+++ b/examples/gnn/gnn_model/layer.py
@@ -0,0 +1,68 @@
+import hetu as ht
+from hetu import init
+
+
+class GCN(object):
+    def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0,
+                 name="GCN", custom_init=None):
+        if custom_init is not None:
+            self.weight = ht.Variable(
+                value=custom_init[0], name=name+"_Weight")
+            self.bias = ht.Variable(value=custom_init[1], name=name+"_Bias")
+        else:
+            self.weight = init.xavier_uniform(
+                shape=(in_features, out_features), name=name+"_Weight")
+            self.bias = init.zeros(shape=(out_features,), name=name+"_Bias")
+        # self.mp is a sparse matrix and should appear in feed_dict later
+        self.mp = norm_adj
+        self.activation = activation
+        self.dropout = dropout
+        self.output_width = out_features
+
+    def __call__(self, x):
+        """
+        Build the computation graph, return the output node
+        """
+        if self.dropout > 0:
+            x = ht.dropout_op(x, 1 - self.dropout)
+        x = ht.matmul_op(x, self.weight)
+        msg = x + ht.broadcastto_op(self.bias,
x) + x = ht.csrmm_op(self.mp, msg) + if self.activation == "relu": + x = ht.relu_op(x) + elif self.activation is not None: + raise NotImplementedError + return x + + +class SageConv(object): + def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0, + name="GCN", custom_init=None, mp_val=None): + + self.weight = init.xavier_uniform( + shape=(in_features, out_features), name=name+"_Weight") + self.bias = init.zeros(shape=(out_features,), name=name+"_Bias") + self.weight2 = init.xavier_uniform( + shape=(in_features, out_features), name=name+"_Weight") + # self.mp is a sparse matrix and should appear in feed_dict later + self.mp = norm_adj + self.activation = activation + self.dropout = dropout + self.output_width = 2 * out_features + + def __call__(self, x): + """ + Build the computation graph, return the output node + """ + feat = x + if self.dropout > 0: + x = ht.dropout_op(x, 1 - self.dropout) + + x = ht.csrmm_op(self.mp, x) + x = ht.matmul_op(x, self.weight) + x = x + ht.broadcastto_op(self.bias, x) + if self.activation == "relu": + x = ht.relu_op(x) + elif self.activation is not None: + raise NotImplementedError + return ht.concat_op(x, ht.matmul_op(feat, self.weight2), axis=1) diff --git a/examples/gnn/gnn_model/model.py b/examples/gnn/gnn_model/model.py new file mode 100644 index 0000000..502271c --- /dev/null +++ b/examples/gnn/gnn_model/model.py @@ -0,0 +1,62 @@ +import hetu as ht +import numpy as np +from .layer import GCN, SageConv + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + + +def sparse_model(int_feature, hidden_layer_size, embedding_idx_max, embedding_width, num_classes, lr): + y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot( + g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu())) + mask_ = ht.Variable(name="mask_") + index_ = ht.GNNDataLoaderOp(lambda g: ht.array( + g.i_feat[:, 0:-2], ctx=ht.cpu()), ctx=ht.cpu()) + embedding = ht.init.random_normal( + [embedding_idx_max, embedding_width], stddev=0.1) + embed = ht.embedding_lookup_op(embedding, index_) + feat = ht.array_reshape_op(embed, (-1, int_feature * embedding_width)) + + norm_adj_ = ht.Variable("message_passing", trainable=False, value=None) + gcn1 = GCN(int_feature * embedding_width, + hidden_layer_size, norm_adj_, activation="relu") + gcn2 = GCN(gcn1.output_width, num_classes, norm_adj_) + x = gcn1(feat) + y = gcn2(x) + loss = ht.softmaxcrossentropy_op(y, y_) + train_loss = loss * mask_ + train_loss = ht.reduce_mean_op(train_loss, [0]) + opt = ht.optim.SGDOptimizer(lr) + train_op = opt.minimize(train_loss) + # model input & model output + return [loss, y, train_op], [mask_, norm_adj_] + + +def dense_model(feature_dim, hidden_layer_size, num_classes, lr, arch=GCN): + y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot( + g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu())) + mask_ = ht.Variable(name="mask_") + feat = ht.GNNDataLoaderOp(lambda g: ht.array( + g.f_feat, ctx=ht.cpu()), ctx=ht.cpu()) + + norm_adj_ = ht.Variable("message_passing", trainable=False, value=None) + gcn1 = arch(feature_dim, hidden_layer_size, norm_adj_, activation="relu") + gcn2 = arch(gcn1.output_width, hidden_layer_size, + norm_adj_, activation="relu") + classifier = ht.init.xavier_uniform(shape=(gcn2.output_width, num_classes)) + x = gcn1(feat) + x = gcn2(x) + y = 
ht.matmul_op(x, classifier) + loss = ht.softmaxcrossentropy_op(y, y_) + train_loss = loss * mask_ + train_loss = ht.reduce_mean_op(train_loss, [0]) + opt = ht.optim.SGDOptimizer(lr) + train_op = opt.minimize(train_loss) + # model input & model output + return [loss, y, train_op], [mask_, norm_adj_] diff --git a/examples/gnn/gnn_model/utils.py b/examples/gnn/gnn_model/utils.py new file mode 100644 index 0000000..33feeb9 --- /dev/null +++ b/examples/gnn/gnn_model/utils.py @@ -0,0 +1,46 @@ +import hetu +import graphmix +import numpy as np +from tqdm import tqdm + + +def padding(graph, target_num_nodes): + assert graph.num_nodes <= target_num_nodes + graph.convert2coo() + new_graph = graphmix.Graph(graph.edge_index, target_num_nodes) + new_graph.tag = graph.tag + new_graph.type = graph.type + extra = target_num_nodes - graph.num_nodes + new_graph.i_feat = np.concatenate( + [graph.i_feat, np.tile(graph.i_feat[0], [extra, 1])]) + new_graph.f_feat = np.concatenate( + [graph.f_feat, np.tile(graph.f_feat[0], [extra, 1])]) + if graph.extra.size: + new_graph.extra = np.concatenate([graph.extra, np.zeros([extra, 1])]) + return new_graph + + +def prepare_data(ngraph): + cli = graphmix.Client() + graphs = [] + for i in tqdm(range(ngraph)): + query = cli.pull_graph() + graph = cli.wait(query) + graphs.append(graph) + max_num_nodes = 0 + for i in range(ngraph): + max_num_nodes = max(max_num_nodes, graphs[i].num_nodes) + for i in range(ngraph): + graphs[i] = padding(graphs[i], max_num_nodes) + return graphs + + +def get_norm_adj(graph, device, use_original_gcn_norm=False): + norm = graph.gcn_norm(use_original_gcn_norm) + mp_mat = hetu.ndarray.sparse_array( + values=norm, + indices=(graph.edge_index[1], graph.edge_index[0]), + shape=(graph.num_nodes, graph.num_nodes), + ctx=device + ) + return mp_mat diff --git a/examples/gnn/gnn_tools/__init__.py b/examples/gnn/gnn_tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/gnn/gnn_tools/launcher.py b/examples/gnn/gnn_tools/launcher.py new file mode 100644 index 0000000..bebb77b --- /dev/null +++ b/examples/gnn/gnn_tools/launcher.py @@ -0,0 +1,135 @@ +import os +import os.path as osp +import signal +import yaml +import multiprocessing + +import libc_graphmix as _C +import hetu as ht +from graphmix.shard import Shard + +default_graph_root_port = 27770 + + +def start_graph_server(shard, server_init): + os.environ['GRAPHMIX_ROLE'] = "server" + _C.init() + shard.load_graph_shard(_C.rank()) + server = _C.start_server() + server.init_meta(shard.meta) + server.init_data(shard.f_feat, shard.i_feat, shard.edges) + del shard + print("GraphMix Server {} : data initialized at {}:{}".format( + _C.rank(), _C.ip(), _C.port())) + _C.barrier_all() + server_init(server) + _C.finalize() + + +def start_server(): + os.environ["DMLC_ROLE"] = "server" + ht.server_init() + ht.server_finish() + +# two scheduler in one process + + +def start_scheduler(): + os.environ['GRAPHMIX_ROLE'] = "scheduler" + os.environ['DMLC_ROLE'] = "scheduler" + _C.init() + ht.scheduler_init() + ht.scheduler_finish() + _C.finalize() + + +def start_worker(func, args): + os.environ['GRAPHMIX_ROLE'] = "worker" + os.environ['DMLC_ROLE'] = "worker" + _C.init() + ht.worker_init() + args.local_rank = _C.rank() % args.num_local_worker + _C.barrier_all() + func(args) + ht.worker_finish() + _C.finalize() + + +def start_worker_standalone(func, args, local_rank): + args.local_rank = local_rank + func(args) + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + 
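+    # terminate every launched worker/server/scheduler process before exiting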
for proc in process_list: + proc.kill() + exit(0) + + +process_list = [] + + +def launch_graphmix_and_hetu_ps(target, args, server_init, hybrid_config=None): + # open setting file + file_path = osp.abspath(osp.expanduser(osp.normpath(args.config))) + with open(file_path) as setting_file: + settings = yaml.load(setting_file.read(), Loader=yaml.FullLoader) + + # write environment variables + for key, value in settings["shared"].items(): + os.environ[str(key)] = str(value) + + # the graph data path is relative to the setting file path + graph_data_path = osp.abspath(osp.expanduser(osp.normpath(args.path))) + print("GraphMix launcher : Using Graph Data from ", graph_data_path) + + # load graph and set the server number equal to the number of graph parts + shard = Shard(graph_data_path) + os.environ['GRAPHMIX_NUM_SERVER'] = str(shard.meta["num_part"]) + os.environ['GRAPHMIX_NUM_WORKER'] = os.environ['DMLC_NUM_WORKER'] + os.environ['GRAPHMIX_ROOT_URI'] = os.environ['DMLC_PS_ROOT_URI'] + os.environ['GRAPHMIX_ROOT_PORT'] = str(default_graph_root_port) + if 'DMLC_INTERFACE' in os.environ.keys(): + os.environ['GRAPHMIX_INTERFACE'] = os.environ['DMLC_INTERFACE'] + + # get local job number + args.num_local_worker = int(settings["launch"]["worker"]) + args.num_local_graph_server = int(settings["launch"]["graph_server"]) + args.num_local_server = int(settings["launch"]["server"]) + args.scheduler = settings["launch"]["scheduler"] + assert args.num_local_graph_server <= shard.meta["num_part"] + assert args.num_local_worker <= int(os.environ['DMLC_NUM_WORKER']) + assert args.num_local_server <= int(os.environ['DMLC_NUM_SERVER']) + if hybrid_config == "worker": + args.num_local_server = 0 + args.num_local_graph_server = 0 + args.scheduler = False + args.num_local_worker = 1 + elif hybrid_config == "server": + args.num_local_worker = 0 + + # launch workers + for i in range(args.num_local_worker): + proc = multiprocessing.Process( + target=start_worker, args=[target, args]) + process_list.append(proc) + # launch graph servers + for i in range(args.num_local_graph_server): + proc = multiprocessing.Process( + target=start_graph_server, args=[shard, server_init]) + process_list.append(proc) + # launch ps servers + for i in range(args.num_local_server): + proc = multiprocessing.Process(target=start_server, args=[]) + process_list.append(proc) + # launch scheduler + if args.scheduler: + proc = multiprocessing.Process(target=start_scheduler) + process_list.append(proc) + # wait until all process finish + for proc in process_list: + proc.start() + signal.signal(signal.SIGINT, signal_handler) + for proc in process_list: + proc.join() diff --git a/examples/gnn/gnn_tools/log.py b/examples/gnn/gnn_tools/log.py new file mode 100644 index 0000000..1c5f536 --- /dev/null +++ b/examples/gnn/gnn_tools/log.py @@ -0,0 +1,68 @@ +import multiprocessing +import numpy as np +import time + +logfile = open("log.txt", "w") + + +class SharedTrainingStat(): + def __init__(self): + self.manager = multiprocessing.Manager() + self.lock = self.manager.Lock() + self.total = self.manager.Value("total", 0) + self.acc = self.manager.Value("acc", 0) + self.loss = self.manager.Value("loss", 0.0) + self.count = self.manager.Value("count", 0) + self.train_total = self.manager.Value("train_total", 0) + self.train_acc = self.manager.Value("train_acc", 0) + self.train_loss = self.manager.Value("train_loss", 0.0) + self.train_count = self.manager.Value("train_count", 0) + self.time = [] + + def update(self, acc, total, loss): + self.lock.acquire() + 
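+        # accumulate evaluation counters under the lock; print() reports the averages and then resets them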
self.total.value += total + self.acc.value += acc + self.loss.value += loss + self.count.value += 1 + self.lock.release() + + def update_train(self, acc, total, loss): + self.lock.acquire() + self.train_total.value += total + self.train_acc.value += acc + self.train_loss.value += loss + self.train_count.value += 1 + self.lock.release() + + def print(self, start=""): + self.lock.acquire() + if len(self.time) > 3: + epoch_time = np.array(self.time[1:])-np.array(self.time[:-1]) + print( + "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time))) + self.time.append(time.time()) + print( + start, + "test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format( + self.loss.value / self.count.value, + self.acc.value / self.total.value, + self.train_loss.value / self.train_count.value, + self.train_acc.value / self.train_total.value + ) + ) + print( + self.loss.value / self.count.value, self.acc.value / self.total.value, + self.train_loss.value / self.train_count.value, self.train_acc.value / + self.train_total.value, + file=logfile, flush=True + ) + self.total.value = 0 + self.acc.value = 0 + self.loss.value = 0 + self.count.value = 0 + self.train_total.value = 0 + self.train_acc.value = 0 + self.train_loss.value = 0 + self.train_count.value = 0 + self.lock.release() diff --git a/examples/gnn/gnn_tools/part_graph.py b/examples/gnn/gnn_tools/part_graph.py new file mode 100644 index 0000000..45896e5 --- /dev/null +++ b/examples/gnn/gnn_tools/part_graph.py @@ -0,0 +1,25 @@ +from graphmix.partition import part_graph +from graphmix.dataset import load_dataset +from sparse_datasets import load_sparse_dataset +import argparse +import os.path as osp +import yaml + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", "-d", required=True) + parser.add_argument("--nparts", "-n", required=True) + parser.add_argument("--path", "-p", required=True) + args = parser.parse_args() + output_path = str(args.path) + nparts = int(args.nparts) + dataset, idx_max = load_sparse_dataset(args.dataset) + output_path = osp.expanduser(osp.join(output_path, args.dataset)) + part_graph(dataset, nparts, output_path) + # now write idx_max into meta.yml + meta_file = osp.join(output_path, "meta.yml") + with open(meta_file) as f: + meta = yaml.load(f.read(), Loader=yaml.FullLoader) + meta["idx_max"] = idx_max + with open(meta_file, "w") as f: + yaml.dump(meta, f, sort_keys=False) diff --git a/examples/gnn/gnn_tools/prepare_amazon_data.py b/examples/gnn/gnn_tools/prepare_amazon_data.py new file mode 100644 index 0000000..71beb0d --- /dev/null +++ b/examples/gnn/gnn_tools/prepare_amazon_data.py @@ -0,0 +1,120 @@ +import json +import ast +import numpy as np +import nltk +# all products with metadata +filemap = {'train': './amazon-3M_train_map.txt', + 'test': './amazon-3M_test_map.txt', + 'bow': './amazon-3M_feature_map.txt', + 'meta': './metadata.json', + 'output': './graph.npz', + 'output_sparse': './sparsefeature.npy'} + + +def getBagofWord(): + bow = dict() + with open(filemap['bow'], 'r') as f: + # start with 1, 0 for padding + word_cnt = 1 + for line in f.read().strip().split(): + bow[line] = word_cnt + word_cnt += 1 + return bow + + +def gettoken(descriptions, length): + bow = getBagofWord() + token_matrix = [] + for desc in descriptions: + token_id = [] + token = nltk.word_tokenize(desc.lower()) + for word in token: + if word in bow: + token_id.append(bow[word]) + if len(token_id) == length: + break + while len(token_id) < length: + 
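+                # pad shorter descriptions with 0, the id reserved for padding (word ids start from 1)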
token_id.append(0) + token_matrix.append(token_id) + return np.array(token_matrix) + + +prod_all = dict() +prod_rcd = dict() +with open(filemap['meta'], 'r') as f: + for line in f: + prod = ast.literal_eval(line.strip().replace('\n', '\\n')) + asin = prod['asin'] + prod_all[asin] = prod + if 'related' in prod and 'categories' in prod and 'description' in prod: + prod_rcd[asin] = prod + +testNodes = set() +prod_gcn = dict() +asin2id = dict() +cnt_id = 0 +asinlist = [] + +for kword in ['train', 'test']: + with open(filemap[kword], 'r') as f: + for line in f: + asin = line.split()[0] + if asin in prod_rcd: + if kword == 'test': + testNodes.add(asin) + prod_gcn[asin] = prod_rcd[asin] + asin2id[asin] = cnt_id + cnt_id += 1 + asinlist.append(asin) + +graphlen = len(prod_gcn) +print('#products with rel/cat/des/feat (GCN assumptions)', graphlen) +print('#trainNodes:', graphlen-len(testNodes), 'testNodes:', len(testNodes)) + +print(len(asin2id)) + +cat2id = dict() +cnt_id = 0 + +class_map = np.zeros(graphlen).astype(np.int32) +train_map = np.zeros(graphlen).astype(np.int32) +descriptions = [] +for idx, asin in enumerate(asinlist): + prod = prod_gcn[asin] + isTest = True if asin in testNodes else False + + cat = prod['categories'][0][0] + if cat not in cat2id: + cat2id[cat] = (cnt_id, 0, 0) + cnt_id += 1 + + if isTest: + cat2id[cat] = (cat2id[cat][0], cat2id[cat][1], cat2id[cat][2]+1) + else: + cat2id[cat] = (cat2id[cat][0], cat2id[cat][1]+1, cat2id[cat][2]) + + class_map[idx] = cat2id[cat][0] + train_map[idx] = 0 if isTest else 1 + if "title" in prod: + descriptions.append(prod["title"] + " " + prod['description']) + else: + descriptions.append(prod['description']) + +print('Classes:', cat2id) +print("Num Classes:", len(cat2id)) + +links_set = set() +for idx, asin in enumerate(asinlist): + for rel, neighbors in prod_gcn[asin]['related'].items(): + for asin_nei in neighbors: + if asin_nei not in asin2id: + continue + idx_nei = asin2id[asin_nei] + lk = (idx, idx_nei) if idx_nei > idx else (idx_nei, idx) + if lk not in links_set: + links_set.add(lk) +links = np.array(list(links_set)) +print('#links between products:', len(links)) +token_matrix = gettoken(descriptions, 16) +np.savez(file=filemap['output'], y=class_map, train_map=train_map, edge=links) +np.save(file=filemap['output_sparse'], arr=token_matrix) diff --git a/examples/gnn/gnn_tools/sparse_datasets.py b/examples/gnn/gnn_tools/sparse_datasets.py new file mode 100644 index 0000000..54582f5 --- /dev/null +++ b/examples/gnn/gnn_tools/sparse_datasets.py @@ -0,0 +1,89 @@ +import graphmix +from graphmix.dataset import load_dataset +import numpy as np +import os.path as osp + + +class AmazonSparseDataset(): + def __init__(self, dataset_root): + self.name = "AmazonSparse" + data = np.load(osp.join(dataset_root, "graph.npz")) + feat = np.load(osp.join(dataset_root, "sparsefeature.npy")) + num_nodes = feat.shape[0] + edge = data['edge'].T + directed = np.concatenate([edge, edge[[1, 0]]], axis=1) + self.idx_max = np.max(feat) + 1 + node_id = np.arange(num_nodes).reshape(-1, 1) + self.idx_max + self.idx_max += num_nodes + self.x = np.empty([num_nodes, 0]) + self.y = np.concatenate( + [feat, node_id, data['y'].reshape(-1, 1)], axis=-1) + self.train_mask = data["train_map"] + self.graph = graphmix.Graph( + edge_index=directed, + num_nodes=num_nodes + ) + self.num_classes = int(np.max(data['y']) + 1) + + +class OGBNmagDataset(): + def __init__(self, dataset_root): + self.name = "ogbn-mag" + from ogb.nodeproppred import PygNodePropPredDataset + dataset = 
PygNodePropPredDataset(name=self.name, root=dataset_root) + data = dataset[0] + year = data.node_year['paper'].numpy() + self.train_mask = year < 2018 + edge = data.edge_index_dict['paper', 'cites', 'paper'].numpy() + directed = np.concatenate([edge, edge[[1, 0]]], axis=1) + num_nodes = data.num_nodes_dict['paper'] + self.graph = graphmix.Graph( + edge_index=directed, + num_nodes=num_nodes + ) + self.num_classes = dataset.num_classes + + def process_sparse_idx(rel, length, base): + sp_idx = [[] for i in range(num_nodes)] + for i, j in rel.T: + sp_idx[i].append(j) + for i in range(num_nodes): + if len(sp_idx[i]) > length: + sp_idx[i] = sp_idx[i][0:length] + while len(sp_idx[i]) < length: + sp_idx[i].append(-1) + sp_idx = np.array(sp_idx) + sp_idx += (base + 1) + return sp_idx + + node_id = np.arange(num_nodes).reshape(-1, 1) + field = data.edge_index_dict[( + 'paper', 'has_topic', 'field_of_study')].numpy() + paper_field = process_sparse_idx(field, 10, num_nodes) + idx_max = num_nodes + data.num_nodes_dict['field_of_study'] + 1 + author = data.edge_index_dict[('author', 'writes', 'paper')].numpy() + paper_author = process_sparse_idx(author[[1, 0]], 10, idx_max) + idx_max += data.num_nodes_dict['author'] + 1 + self.idx_max = idx_max + self.x = np.empty([num_nodes, 0]) + self.y = np.concatenate([ + paper_field, paper_author, node_id, data.y_dict["paper"].numpy() + ], axis=1) + + +def load_sparse_dataset(name): + root_dir = osp.expanduser(osp.join('~/.graphmix_dataset/', name)) + if name == "Reddit": + dataset = load_dataset(name) + idx_max = dataset.x.shape[0] + node_id = np.arange(idx_max).reshape(-1, 1) + dataset.y = np.concatenate([node_id, dataset.y.reshape(-1, 1)], axis=1) + elif name == "AmazonSparse": + dataset = AmazonSparseDataset(root_dir) + idx_max = dataset.idx_max + elif name == "ogbn-mag": + dataset = OGBNmagDataset(root_dir) + idx_max = dataset.idx_max + else: + raise NotImplementedError + return dataset, int(idx_max) diff --git a/examples/gnn/run_dist.py b/examples/gnn/run_dist.py new file mode 100644 index 0000000..8b1bccc --- /dev/null +++ b/examples/gnn/run_dist.py @@ -0,0 +1,91 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model +from gnn_tools.log import SharedTrainingStat +import graphmix + +import hetu as ht + +import numpy as np +import argparse + +# usage : on each machine +# python3 run_dist.py [configfile] [-p data_path] + + +def train_main(args): + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(rank % args.num_local_worker) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS', + use_sparse_pull=False, cstable_policy=args.cache) + nbatches = meta["train_node"] // (args.batch_size * nrank) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] 
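+            # pick the next sampled graph; it is handed to the data loader below while this iteration still trains on the current `graph`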
+ idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + stat.update(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + stat.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + ht.get_worker_communicate().BarrierWorker() + graph = graph_nxt + if rank == 0: + stat.print(epoch) + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("config") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + parser.add_argument("--cache", default="LFUOpt", type=str) + args = parser.parse_args() + stat = SharedTrainingStat() + launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init) diff --git a/examples/gnn/run_dist_hybrid.py b/examples/gnn/run_dist_hybrid.py new file mode 100644 index 0000000..21cf333 --- /dev/null +++ b/examples/gnn/run_dist_hybrid.py @@ -0,0 +1,153 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model +import graphmix + +import hetu as ht +from hetu.communicator.mpi_nccl_comm import ncclDataType_t, ncclRedOp_t + +import numpy as np +import time +import os +import sys +import multiprocessing +import argparse + +# usage : +# mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] [-p data_path] +# python3 run_dist_hybrid.py [configfile] [-p data_path] --server + + +class TrainStat(): + def __init__(self, comm): + self.file = open("log.txt", "w") + self.train_stat = np.zeros(4) + self.test_stat = np.zeros(4) + self.count = 0 + self.time = [] + self.comm = comm + + def update_test(self, cnt, total, loss): + self.test_stat += [1, cnt, total, loss] + + def update_train(self, cnt, total, loss): + self.train_stat += [1, cnt, total, loss] + + def sync_and_clear(self): + self.count += 1 + train_stat = ht.array(self.train_stat, ht.cpu()) + test_stat = ht.array(self.test_stat, ht.cpu()) + self.comm.dlarrayNcclAllReduce( + train_stat, train_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream) + self.comm.dlarrayNcclAllReduce( + test_stat, test_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream) + self.comm.stream.sync() + train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy() + printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format( + self.count, + 
test_stat[3] / test_stat[0], + test_stat[1] / test_stat[2], + train_stat[3] / train_stat[0], + train_stat[1] / train_stat[2], + ) + logstr = "{} {} {} {}".format( + test_stat[3] / test_stat[0], + test_stat[1] / test_stat[2], + train_stat[3] / train_stat[0], + train_stat[1] / train_stat[2], + ) + self.time.append(time.time()) + if self.comm.device_id.value == 0: + print(printstr, flush=True) + print(logstr, file=self.file, flush=True) + if len(self.time) > 3: + epoch_time = np.array(self.time[1:])-np.array(self.time[:-1]) + print( + "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time))) + + self.train_stat[:] = 0 + self.test_stat[:] = 0 + + +def train_main(args): + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(device_id) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = 10 * meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='Hybrid', + use_sparse_pull=False, cstable_policy=args.cache) + nbatches = meta["train_node"] // (args.batch_size * nrank) + train_state = TrainStat(comm) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(device_id)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + train_state.update_test(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + train_state.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + ht.get_worker_communicate().BarrierWorker() + graph = graph_nxt + train_state.sync_and_clear() + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("config") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + parser.add_argument("--cache", default="LFUOpt", type=str) + parser.add_argument("--server", action="store_true") + args = 
parser.parse_args() + if args.server: + launch_graphmix_and_hetu_ps( + train_main, args, server_init, hybrid_config="server") + else: + launch_graphmix_and_hetu_ps( + train_main, args, server_init, hybrid_config="worker") diff --git a/examples/gnn/run_single.py b/examples/gnn/run_single.py new file mode 100644 index 0000000..b5a8418 --- /dev/null +++ b/examples/gnn/run_single.py @@ -0,0 +1,95 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_tools.log import SharedTrainingStat +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model, dense_model +import graphmix + +import hetu as ht + +import numpy as np +import time +import os +import sys +import argparse + +# usage +# python3 run_single.py [-p data_path] + + +def train_main(args): + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(rank % args.num_local_worker) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + if args.dense: + [loss, y, train_op], [mask_, norm_adj_] = dense_model( + meta["float_feature"], args.hidden_size, meta["class"], args.learning_rate) + else: + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx) + nbatches = meta["train_node"] // (args.batch_size * nrank) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + stat.update(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + stat.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + graph = graph_nxt + stat.print(epoch) + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--config", default="config/single.yml") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + 
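+    # --dense selects the dense-feature model in train_main; omit it for datasets partitioned with sparse embeddings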
parser.add_argument("--dense", action="store_true") + args = parser.parse_args() + stat = SharedTrainingStat() + launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init) diff --git a/examples/nlp/.gitignore b/examples/nlp/.gitignore new file mode 100644 index 0000000..c90eef6 --- /dev/null +++ b/examples/nlp/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +iwslt2016/ +logs/ \ No newline at end of file diff --git a/examples/nlp/README.md b/examples/nlp/README.md new file mode 100644 index 0000000..53351d8 --- /dev/null +++ b/examples/nlp/README.md @@ -0,0 +1,18 @@ +# NLP Examples +In this directory we provide simple implementations for Transformer model. We use the IWSLT2016 de-en dataset. +## Structure +``` +- nlp + - hparams.py Hyperparameters + - prepare_data.py Downloading and preparing data + - data_load.py Dataloader + - hetu_transformer.py Transformer model in hetu + - tf_transformer.py Transformer model in tensorflow + - train_hetu_transformer.py Trainer for hetu + - train_tf_transformer.py Trainer for tensorflow +``` +## Usage +```bash +python train_{framework}_transformer.py +``` +To change the hyperparameters, please modify `hparams.py` file. \ No newline at end of file diff --git a/examples/nlp/__init__.py b/examples/nlp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/nlp/bookcorpus/bookcorpus.py b/examples/nlp/bookcorpus/bookcorpus.py new file mode 100644 index 0000000..42c696e --- /dev/null +++ b/examples/nlp/bookcorpus/bookcorpus.py @@ -0,0 +1,92 @@ +# Lint as: python3 +"""The BookCorpus dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """ + + +import glob +import os +import pathlib + +import datasets + + +_DESCRIPTION = """\ +Books are a rich source of both fine-grained information, how a character, \ +an object or a scene looks like, as well as high-level semantics, what \ +someone is thinking, feeling and how these states evolve through a story.\ +This version of bookcorpus has 17868 dataset items (books). Each item contains \ +two fields: title and text. The title is the name of the book (just the file name) \ +while text contains unprocessed book text. The bookcorpus has been prepared by \ +Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community \ +driven platform dedicated to the archiving and long-term preservation of any and \ +all data including but by no means limited to... websites, books, games, software, \ +video, audio, other digital-obscura and ideas. +""" + +_CITATION = """\ +@InProceedings{Zhu_2015_ICCV, + title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books}, + author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {December}, + year = {2015} +} +""" +_PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27" +# _DOWNLOAD_URL = "https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz" +_DOWNLOAD_URL = "/home/xiaonan/develope/Athena/datasets/books_doc_format.tar.gz" + + +class BookCorpusOpenConfig(datasets.BuilderConfig): + """BuilderConfig for BookCorpus.""" + + def __init__(self, **kwargs): + """BuilderConfig for BookCorpus. + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(BookCorpusOpenConfig, self).__init__( + version=datasets.Version("1.0.0", ""), **kwargs) + + +class BookCorpusOpen(datasets.GeneratorBasedBuilder): + """BookCorpus dataset.""" + + BUILDER_CONFIGS = [ + BookCorpusOpenConfig( + name="plain_text", + description="Plain text", + ) + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "title": datasets.Value("string"), + "text": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage=_PROJECT_URL, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={ + "directory": arch_path}), + ] + + def _generate_examples(self, directory): + glob_target = os.path.join(directory, "**/*.epub.txt") + book_files = glob.glob(glob_target, recursive=True) + book_files = sorted(book_files) + _id = 0 + for book_file_path in book_files: + path = pathlib.PurePath(book_file_path) + with open(book_file_path, mode="r", encoding="utf-8") as f: + yield _id, {"title": str(path.name), "text": f.read()}, + _id += 1 diff --git a/examples/nlp/data_load.py b/examples/nlp/data_load.py new file mode 100644 index 0000000..b5966f2 --- /dev/null +++ b/examples/nlp/data_load.py @@ -0,0 +1,120 @@ +import numpy as np + + +class DataLoader(object): + def __init__(self, fpath1, fpath2, maxlen1, maxlen2, vocab_fpath): + self.sents1, self.sents2 = self.load_data( + fpath1, fpath2, maxlen1, maxlen2) + self.token2idx, self.idx2token = self.load_vocab(vocab_fpath) + self.maxlen1 = maxlen1 + self.maxlen2 = maxlen2 + + def load_vocab(self, vocab_fpath): + '''Loads vocabulary file and returns idx<->token maps + vocab_fpath: string. vocabulary file path. + Note that these are reserved + 0: , 1: , 2: , 3: + + Returns + two dictionaries. + ''' + vocab = [line.split()[0] for line in open( + vocab_fpath, 'r', encoding='utf-8').read().splitlines()] + token2idx = {token: idx for idx, token in enumerate(vocab)} + idx2token = {idx: token for idx, token in enumerate(vocab)} + return token2idx, idx2token + + def load_data(self, fpath1, fpath2, maxlen1, maxlen2): + '''Loads source and target data and filters out too lengthy samples. + fpath1: source file path. string. + fpath2: target file path. string. + maxlen1: source sent maximum length. scalar. + maxlen2: target sent maximum length. scalar. + + Returns + sents1: list of source sents + sents2: list of target sents + ''' + sents1, sents2 = [], [] + with open(fpath1, 'r', encoding='utf-8') as f1, open(fpath2, 'r', encoding='utf-8') as f2: + for sent1, sent2 in zip(f1, f2): + if len(sent1.split()) + 1 > maxlen1: + continue # 1: + if len(sent2.split()) + 1 > maxlen2: + continue # 1: + sents1.append(sent1.strip()) + sents2.append(sent2.strip()) + return sents1, sents2 + + def encode(self, inp, type, dict): + '''Converts string to number. Used for `generator_fn`. + inp: 1d byte array. 
+ type: "x" (source side) or "y" (target side) + dict: token2idx dictionary + + Returns + list of numbers + ''' + inp_str = inp + if type == "x": + tokens = inp_str.split() + [""] + else: + tokens = [""] + inp_str.split() + [""] + + x = [dict.get(t, dict[""]) for t in tokens] + return x + + def make_epoch_data(self, batch_size, shuffle=False): + import copy + new_sents1 = copy.deepcopy(self.sents1) + new_sents2 = copy.deepcopy(self.sents2) + if shuffle: + import random + random.shuffle(new_sents1) + random.shuffle(new_sents2) + xs = [self.encode(sent1, "x", self.token2idx) for sent1 in new_sents1] + ys = [self.encode(sent2, "y", self.token2idx) for sent2 in new_sents2] + batch_xs = [] + batch_ys = [] + for i in range(0, len(xs), batch_size): + start = i + end = start + batch_size + batch_xs.append(xs[start:end]) + batch_ys.append(ys[start:end]) + if len(batch_xs[-1]) != batch_size: + batch_xs = batch_xs[:-1] + batch_ys = batch_ys[:-1] + self.cur_xs = batch_xs + self.cur_ys = batch_ys + self.batch_num = len(batch_xs) + self.idx = 0 + + def get_batch(self, fill_maxlen=True): + if self.idx >= self.batch_num: + assert False + cur_batch_x = self.cur_xs[self.idx] + cur_batch_y = self.cur_ys[self.idx] + self.idx += 1 + + if fill_maxlen: + cur_largest_len_x = self.maxlen1 + cur_largest_len_y = self.maxlen2 + else: + cur_largest_len_x = max([len(x) for x in cur_batch_x]) + cur_largest_len_y = max([len(y) for y in cur_batch_y]) + + cur_batch_x = np.array([self.align(x, cur_largest_len_x) + for x in cur_batch_x]).astype(np.float32) + cur_batch_y = np.array([self.align(y, cur_largest_len_y) + for y in cur_batch_y]).astype(np.float32) + return (cur_batch_x, cur_largest_len_x), (cur_batch_y, cur_largest_len_y) + + def align(self, arr, length): + ori_len = len(arr) + if length > ori_len: + return arr + [0] * (length - ori_len) + else: + return arr[:length] + + def get_pad(self): + return self.token2idx[""] diff --git a/examples/nlp/hetu_transformer.py b/examples/nlp/hetu_transformer.py new file mode 100644 index 0000000..907f7a8 --- /dev/null +++ b/examples/nlp/hetu_transformer.py @@ -0,0 +1,266 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def layer_norm( + input_tensor, + feature_size, + eps=1e-8 +): + scale = init.ones(name='layer_norm_scale', shape=(feature_size, )) + bias = init.zeros(name='layer_norm_biad', shape=(feature_size, )) + return ht.layer_normalization_op(input_tensor, scale, bias, eps=eps) + + +def dense( + input_tensor, + fan_in, + fan_out, + activation=None, + kernel_initializer=init.xavier_normal, + bias_initializer=init.zeros +): + weights = kernel_initializer(name='dense_weights', shape=(fan_in, fan_out)) + bias = bias_initializer(name='dense_bias', shape=(fan_out,)) + outputs = ht.matmul_op(input_tensor, weights) + outputs = outputs + ht.broadcastto_op(bias, outputs) + if activation is not None: + outputs = activation(outputs) + return outputs + + +def dropout( + input_tensor, + dropout_prob +): + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + output = ht.dropout_op(input_tensor, 1.0 - dropout_prob) + return output + + +def get_token_embeddings(vocab_size, num_units, initializer=init.xavier_normal, zero_pad=True): + if zero_pad: + embedding_part = initializer( + name='embedding_table', shape=(vocab_size-1, num_units)) + padding_zero = init.zeros( + name='padding_zero', shape=(1, num_units), trainable=False) + embeddings = ht.concat_op(padding_zero, embedding_part) + else: + embeddings = initializer( + name='embedding_table', 
shape=(vocab_size, num_units)) + return embeddings + + +def multihead_attention( + queries, keys, values, + config, + query_act=None, key_act=None, value_act=None, + attention_mask=None, + causality=False): + + def transpose_for_scores(input_tensor): + output_tensor = ht.array_reshape_op( + input_tensor, [config.batch_size, -1, config.num_heads, config.d_model // config.num_heads]) + + output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3]) + return output_tensor + + batch_size = config.batch_size + hidden_size = config.d_model + num_attention_heads = config.num_heads + caus_len = config.maxlen2 - 1 + attention_probs_dropout_prob = config.dropout_rate + + size_per_head = hidden_size // num_attention_heads + + # reshape to 2d + queries2d = ht.array_reshape_op( + queries, [-1, hidden_size]) # (N * T_q, d_model) + keys2d = ht.array_reshape_op(keys, [-1, hidden_size]) # (N * T_k, d_model) + values2d = ht.array_reshape_op( + values, [-1, hidden_size]) # (N * T_k, d_model) + + # linear transformation + query_layer = dense(queries2d, hidden_size, hidden_size, + query_act) # (N * T_k, d_model) + key_layer = dense(keys2d, hidden_size, hidden_size, + key_act) # (N * T_k, d_model) + value_layer = dense(values2d, hidden_size, hidden_size, + value_act) # (N * T_k, d_model) + + # transpose + query_layer = transpose_for_scores(query_layer) # (N, h, T_q, d_model/h) + key_layer = transpose_for_scores(key_layer) # (N, h, T_k, d_model/h) + value_layer = transpose_for_scores(value_layer) # (N, h, T_k, d_model/h) + + # score + attention_scores = ht.batch_matmul_op( + query_layer, key_layer, trans_B=True) # (N, h, T_q, T_k) + attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head))) + + # mask + if attention_mask is not None: + zeros = ht.Variable('no_mask', value=np.array( + (0,), dtype=np.float32), trainable=False) + adder = ht.Variable('attention_mask', value=np.array( + (-2**32+1,), dtype=np.float32), trainable=False) + zeros = ht.broadcastto_op(zeros, attention_mask) + adder = ht.broadcastto_op(adder, attention_mask) + attention_mask = ht.where_op(attention_mask, zeros, adder) # (N, T) + attention_mask = ht.array_reshape_op( + attention_mask, [batch_size, 1, 1, -1]) + attention_scores = attention_scores + \ + ht.broadcastto_op(attention_mask, attention_scores) + if causality: + tril = ht.Variable(name='tril', value=np.tril( + np.ones((caus_len, caus_len))), trainable=False) # (T, T) + future_masks = ht.broadcast_shape_op( + tril, [batch_size, num_attention_heads, caus_len, caus_len]) + adder = ht.Variable('future_mask', value=np.array( + (-2**32+1,), dtype=np.float32), trainable=False) + adder = ht.broadcastto_op(adder, future_masks) + attention_scores = ht.where_op( + future_masks, attention_scores, adder) # (N, h, T, T) + + # probs + attention_probs = ht.softmax_op(attention_scores) + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + context_layer = ht.batch_matmul_op(attention_probs, value_layer) + context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) + outputs = ht.array_reshape_op( + context_layer, + [batch_size, -1, num_attention_heads * size_per_head]) + + # Residual connection + outputs = outputs + queries # (N, T_q, d_model) + + # Normalize + outputs = layer_norm(outputs, hidden_size) # (N, T_q, d_model) + return outputs + + +def ff(inputs, config): + outputs = ht.array_reshape_op(inputs, [-1, config.d_model]) + outputs = dense(outputs, config.d_model, + config.d_ff, activation=ht.relu_op) + outputs = dense(outputs, config.d_ff, 
config.d_model) + outputs = ht.array_reshape_op( + outputs, [config.batch_size, -1, config.d_model]) + outputs = outputs + inputs + outputs = layer_norm(outputs, config.d_model) + return outputs + + +def label_smoothing(inputs, V, epsilon=0.1): + # V = inputs.shape[-1] # number of channels + return ((1-epsilon) * inputs) + (epsilon / V) + + +def positional_encoding( + inputs, + inputs_shape, + maxlen, + masking=True +): + N, T, E = tuple(inputs_shape) + position_enc = np.array([ + [pos / np.power(10000, (i & -2)/E) for i in range(E)] + for pos in range(maxlen)]) + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + + position_enc = position_enc[:T, :] + outputs = ht.Variable(name='position_enc', value=np.tile( + position_enc, [N, 1, 1]), trainable=False) + zeros = ht.Variable(name='zeros', value=np.zeros( + inputs_shape), trainable=False) + + if masking: + outputs = ht.where_op(inputs, outputs, zeros) + + return outputs + + +class Transformer(object): + def __init__(self, hp): + self.hp = hp + self.embeddings = get_token_embeddings( + self.hp.vocab_size, self.hp.d_model, zero_pad=True) + + def encode(self, xs): + x = xs + + # embedding + enc = ht.embedding_lookup_op(self.embeddings, x) # (N, T1, d_model) + enc = enc * self.hp.d_model**0.5 # scale + + enc += positional_encoding(enc, (self.hp.batch_size, + self.hp.maxlen1, self.hp.d_model), self.hp.maxlen1) + enc = dropout(enc, self.hp.dropout_rate) + + # Blocks + for i in range(self.hp.num_blocks): + # self-attention + enc = multihead_attention( + queries=enc, keys=enc, values=enc, + config=self.hp, + attention_mask=x, + causality=False + ) + # feed forward + enc = ff(enc, config=self.hp) + memory = enc + return memory + + def decode(self, ys, memory, src_masks): + decoder_inputs = ys + + # embedding + dec = ht.embedding_lookup_op( + self.embeddings, decoder_inputs) # (N, T2, d_model) + dec = dec * self.hp.d_model ** 0.5 # scale + + dec += positional_encoding(dec, (self.hp.batch_size, + self.hp.maxlen2-1, self.hp.d_model), self.hp.maxlen2) + dec = dropout(dec, self.hp.dropout_rate) + + # Blocks + for i in range(self.hp.num_blocks): + # Masked self-attention (Note that causality is True at this time) + dec = multihead_attention( + queries=dec, keys=dec, values=dec, + config=self.hp, + attention_mask=decoder_inputs, + causality=True, + ) + # Vanilla attention + dec = multihead_attention( + queries=dec, keys=memory, values=memory, + config=self.hp, + attention_mask=src_masks, + causality=False, + ) + # Feed Forward + dec = ff(dec, config=self.hp) + + dec = ht.array_reshape_op( + dec, [-1, self.hp.d_model]) # (N * T, d_model) + logits = ht.array_reshape_op(ht.matmul_op(dec, self.embeddings, trans_B=True), [ + self.hp.batch_size, -1, self.hp.vocab_size]) # (N, T, vocab) + + return logits + + def train(self, xs, ys): + # forward + memory = self.encode(xs) + logits = self.decode(ys[0], memory, xs) + + # train scheme + y = ys[1] + y_ = label_smoothing(ht.one_hot_op( + y, self.hp.vocab_size), self.hp.vocab_size) # (N, T, vocab) + loss = ht.softmaxcrossentropy_op(logits, y_) + + return loss diff --git a/examples/nlp/hparams.py b/examples/nlp/hparams.py new file mode 100644 index 0000000..fbce591 --- /dev/null +++ b/examples/nlp/hparams.py @@ -0,0 +1,63 @@ +import argparse + + +class Hparams: + parser = argparse.ArgumentParser() + + # prepro + parser.add_argument('--vocab_size', default=32000, type=int) + + # train + # files + parser.add_argument('--train1', 
default='iwslt2016/segmented/train.de.bpe', + help="german training segmented data") + parser.add_argument('--train2', default='iwslt2016/segmented/train.en.bpe', + help="english training segmented data") + parser.add_argument('--eval1', default='iwslt2016/segmented/eval.de.bpe', + help="german evaluation segmented data") + parser.add_argument('--eval2', default='iwslt2016/segmented/eval.en.bpe', + help="english evaluation segmented data") + parser.add_argument('--eval3', default='iwslt2016/prepro/eval.en', + help="english evaluation unsegmented data") + + # vocabulary + parser.add_argument('--vocab', default='iwslt2016/segmented/bpe.vocab', + help="vocabulary file path") + + # training scheme + parser.add_argument('--batch_size', default=16, type=int) + parser.add_argument('--eval_batch_size', default=128, type=int) + + parser.add_argument('--lr', default=0.0003, + type=float, help="learning rate") + parser.add_argument('--warmup_steps', default=4000, type=int) + parser.add_argument('--logdir', default="logs/tf", help="log directory") + parser.add_argument('--num_epochs', default=20, type=int) + parser.add_argument('--evaldir', default="logs/tf/eval", + help="evaluation dir") + + # model + parser.add_argument('--d_model', default=512, type=int, + help="hidden dimension of encoder/decoder") + parser.add_argument('--d_ff', default=2048, type=int, + help="hidden dimension of feedforward layer") + parser.add_argument('--num_blocks', default=6, type=int, + help="number of encoder/decoder blocks") + parser.add_argument('--num_heads', default=8, type=int, + help="number of attention heads") + parser.add_argument('--maxlen1', default=100, type=int, + help="maximum length of a source sequence") + parser.add_argument('--maxlen2', default=100, type=int, + help="maximum length of a target sequence") + parser.add_argument('--dropout_rate', default=0.3, type=float) + parser.add_argument('--smoothing', default=0.1, type=float, + help="label smoothing rate") + + # test + parser.add_argument('--test1', default='iwslt2016/segmented/test.de.bpe', + help="german test segmented data") + parser.add_argument('--test2', default='iwslt2016/prepro/test.en', + help="english test data") + parser.add_argument('--ckpt', help="checkpoint file path") + parser.add_argument('--test_batch_size', default=128, type=int) + parser.add_argument('--testdir', default="test/1", help="test result dir") diff --git a/examples/nlp/prepare_data.py b/examples/nlp/prepare_data.py new file mode 100644 index 0000000..85b9ff6 --- /dev/null +++ b/examples/nlp/prepare_data.py @@ -0,0 +1,128 @@ +import os +import wget +import tarfile +import errno +import sentencepiece as spm +import re +from hparams import Hparams +import logging + +logging.basicConfig(level=logging.INFO) + + +def prepro(hp): + """Load raw data -> Preprocessing -> Segmenting with sentencepice + hp: hyperparams. argparse. 
+ """ + logging.info("# Check if raw files exist") + train1 = "iwslt2016/de-en/train.tags.de-en.de" + train2 = "iwslt2016/de-en/train.tags.de-en.en" + eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml" + eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml" + test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml" + test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml" + for f in (train1, train2, eval1, eval2, test1, test2): + if not os.path.isfile(f): + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f) + + logging.info("# Preprocessing") + # train + + def _prepro(x): return [line.strip() for line in open(x, 'r').read().split("\n") + if not line.startswith("<")] + prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2) + assert len(prepro_train1) == len( + prepro_train2), "Check if train source and target files match." + + # eval + def _prepro(x): return [re.sub("<[^>]+>", "", line).strip() + for line in open(x, 'r').read().split("\n") + if line.startswith("= num_to_predict: + break + masked_token = None + # replace with [MASK] at 80%. + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # keep original at 10%. + if rng.random() < 0.5: + masked_token = tokens[index] + # replace with random word at 10%. + else: + masked_token = vocab_words[rng.randint( + 0, len(vocab_words) - 1)] + output_tokens[index] = masked_token + masked_lms.append([index, tokens[index]]) + + masked_lms.sort(key=lambda x: x[0]) + masked_lm_positions = [] + masked_lm_labels = [] + + for p in masked_lms: + masked_lm_positions.append(p[0]) + masked_lm_labels.append(p[1]) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """ Create Training example for input document """ + document = all_document[doc_id] + max_num_tokens = max_seq_length - 3 # [CLS], [SEP], [SEP] + target_seq_length = max_num_tokens + # generate short sequence at the probility of short_seq_prob + # In order to minimize the mismatch between pre-training and fine-tuning. + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # create sentence A + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + tokens_a = [] + for j in range(a_end): + tokens_a.extend([current_chunk[j]]) + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + for _ in range(10): + random_document_index = rng.randint( + 0, len(all_document) - 1) + if random_document_index != doc_id: + break + # If picked random document is the same as the current document + if random_document_index == doc_id: + is_random_next = False + random_document = all_document[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend([random_document[j]]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
+ num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend([current_chunk[j]]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +def convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq): + + num_instances = len(instances) + input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") + input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32") + segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") + masked_lm_positions_list = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + masked_lm_ids_list = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + next_sentence_labels_list = np.zeros(num_instances, dtype="int32") + for (idx, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids( + instance.masked_lm_labels) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + input_ids_list[idx][:] = input_ids + input_mask_list[idx][:] = input_mask + segment_ids_list[idx][:] = segment_ids + masked_lm_positions_list[idx][:] = masked_lm_ids + next_sentence_labels_list[idx] = next_sentence_label + + return input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list + + +def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): + + documents = [] + for i in range(dataset['train'].shape[0]): + tokens = tokenizer.tokenize(dataset['train'][i]['text']) + documents.append(tokens) + print(len(tokens)) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + + for doc_id in range(len(documents)): + instances.extend(create_data_from_document(documents, doc_id, + max_seq_length, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng)) + + # instance: + # tokens + # segment_ids + # is_random_next + # masked_lm_positions + # masked_lm_labels + return 
convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def show_dataset_detail(dataset): + print(dataset.shape) + print(dataset.column_names) + print(dataset['train'].features) + print(dataset['train'][0]['text']) + + +if __name__ == "__main__": + max_seq_length = 512 + do_lower_case = True + short_seq_prob = 0.1 + masked_lm_prob = 0.15 + max_predictions_per_seq = 20 + + vocab_path = "/home/xiaonan/develope/Athena/datasets/bert-base-uncased-vocab.txt" + dataset = load_dataset( + '/home/xiaonan/develope/Athena/examples/nlp/bookcorpus', cache_dir=".") + print("total number of documents {} ".format(dataset['train'].shape[0])) + random_seed = 123 + rng = random.Random(random_seed) + tokenizer = hetu.BertTokenizer( + vocab_file=vocab_path, do_lower_case=do_lower_case) + + input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list = create_pretrain_data( + dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng) + print(input_ids_list[-1]) + print(input_mask_list[-1]) + print(segment_ids_list[-1]) + print(masked_lm_positions_list[-1]) + print(next_sentence_labels_list[-1]) diff --git a/examples/nlp/tf_transformer.py b/examples/nlp/tf_transformer.py new file mode 100644 index 0000000..2c6b1d7 --- /dev/null +++ b/examples/nlp/tf_transformer.py @@ -0,0 +1,442 @@ +import numpy as np +import tensorflow as tf + +from tqdm import tqdm +import logging + +logging.basicConfig(level=logging.INFO) + + +def ln(inputs, epsilon=1e-8, scope="ln"): + '''Applies layer normalization. See https://arxiv.org/abs/1607.06450. + inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`. + epsilon: A floating number. A very small number for preventing ZeroDivision Error. + scope: Optional scope for `variable_scope`. + + Returns: + A tensor with the same shape and data dtype as `inputs`. + ''' + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + inputs_shape = inputs.get_shape() + params_shape = inputs_shape[-1:] + + mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) + beta = tf.get_variable("beta", params_shape, + initializer=tf.zeros_initializer()) + gamma = tf.get_variable("gamma", params_shape, + initializer=tf.ones_initializer()) + normalized = (inputs - mean) / ((variance + epsilon) ** (.5)) + outputs = gamma * normalized + beta + + return outputs + + +def get_token_embeddings(vocab_size, num_units, initializer=tf.contrib.layers.xavier_initializer(), zero_pad=True): + '''Constructs token embedding matrix. + Note that the column of index 0's are set to zeros. + vocab_size: scalar. V. + num_units: embedding dimensionalty. E. + zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero + To apply query/key masks easily, zero pad is turned on. 
+ + Returns + weight variable: (V, E) + ''' + with tf.variable_scope("shared_weight_matrix"): + embeddings = tf.get_variable('weight_mat', + dtype=tf.float32, + shape=(vocab_size, num_units), + initializer=initializer) + if zero_pad: + embeddings = tf.concat((tf.zeros(shape=[1, num_units]), + embeddings[1:, :]), 0) + return embeddings + + +def multihead_attention( + queries, keys, values, + batch_size, hidden_size, + num_attention_heads=8, + query_act=None, key_act=None, value_act=None, + attention_mask=None, + attention_probs_dropout_prob=0.0, + training=True, causality=False, + scope="multihead_attention"): + + def transpose_for_scores(input_tensor): + output_tensor = tf.reshape( + input_tensor, [batch_size, -1, num_attention_heads, hidden_size // num_attention_heads]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + size_per_head = hidden_size // num_attention_heads + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # linear transformation + query_layer = tf.layers.dense( + queries, hidden_size, activation=query_act) # (N, T_q, d_model) + key_layer = tf.layers.dense( + keys, hidden_size, activation=key_act) # (N, T_k, d_model) + value_layer = tf.layers.dense( + values, hidden_size, activation=value_act) # (N, T_k, d_model) + + # transpose + query_layer = transpose_for_scores( + query_layer) # (N, h, T_q, d_model/h) + key_layer = transpose_for_scores(key_layer) # (N, h, T_k, d_model/h) + value_layer = transpose_for_scores( + value_layer) # (N, h, T_k, d_model/h) + + # score + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True) # (N, h, T_q, T_k) + attention_scores /= size_per_head ** 0.5 + + # mask + if attention_mask is not None: + attention_mask = tf.to_float(attention_mask) # (N, T_k) + attention_mask = tf.reshape( + attention_mask, [batch_size, 1, 1, -1]) # (N, 1, 1, T_k) + attention_scores = attention_scores + \ + attention_mask * (-2**32+1) # (N, h, T_q, T_k) + if causality: + diag_vals = tf.ones_like( + attention_scores[0, 0, :, :]) # (T_q, T_k) + tril = tf.linalg.LinearOperatorLowerTriangular( + diag_vals).to_dense() # (T_q, T_k) + future_masks = tf.broadcast_to( + tril, [batch_size, num_attention_heads, tril.shape[0], tril.shape[1]]) # (N, h, T_q, T_k) + paddings = tf.ones_like(future_masks) * (-2**32+1) + attention_scores = tf.where( + tf.equal(future_masks, 0), paddings, attention_scores) + + # probs + attention_probs = tf.nn.softmax(attention_scores) # (N, h, T_q, T_k) + attention_probs = tf.layers.dropout( + attention_probs, rate=attention_probs_dropout_prob, training=training) + # (N, h, T_q, d_model/h) + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose( + context_layer, [0, 2, 1, 3]) # (N, T_q, h, d_model/h) + outputs = tf.reshape(context_layer, [ + batch_size, -1, num_attention_heads * size_per_head]) # (N, T_q, d_model) + + # Residual connection + outputs += queries # (N, T_q, d_model) + + # Normalize + outputs = ln(outputs) # (N, T_q, d_model) + + return outputs + + +def ff(inputs, num_units, scope="positionwise_feedforward"): + '''position-wise feed forward net. See 3.3 + + inputs: A 3d tensor with shape of [N, T, C]. + num_units: A list of two integers. + scope: Optional scope for `variable_scope`. 
+ + Returns: + A 3d tensor with the same shape and dtype as inputs + ''' + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # Inner layer + outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu) + # Outer layer + outputs = tf.layers.dense(outputs, num_units[1]) + # Residual connection + outputs += inputs + # Normalize + outputs = ln(outputs) + return outputs + + +def label_smoothing(inputs, epsilon=0.1): + '''Applies label smoothing. See 5.4 and https://arxiv.org/abs/1512.00567. + inputs: 3d tensor. [N, T, V], where V is the number of vocabulary. + epsilon: Smoothing rate. + + For example, + + ``` + import tensorflow as tf + inputs = tf.convert_to_tensor([[[0, 0, 1], + [0, 1, 0], + [1, 0, 0]], + + [[1, 0, 0], + [1, 0, 0], + [0, 1, 0]]], tf.float32) + + outputs = label_smoothing(inputs) + + with tf.Session() as sess: + print(sess.run([outputs])) + + >> + [array([[[ 0.03333334, 0.03333334, 0.93333334], + [ 0.03333334, 0.93333334, 0.03333334], + [ 0.93333334, 0.03333334, 0.03333334]], + + [[ 0.93333334, 0.03333334, 0.03333334], + [ 0.93333334, 0.03333334, 0.03333334], + [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] + ``` + ''' + V = inputs.get_shape().as_list()[-1] # number of channels + return ((1-epsilon) * inputs) + (epsilon / V) + + +def positional_encoding(inputs, + maxlen, + masking=True, + scope="positional_encoding"): + '''Sinusoidal Positional_Encoding. See 3.5 + inputs: 3d tensor. (N, T, E) + maxlen: scalar. Must be >= T + masking: Boolean. If True, padding positions are set to zeros. + scope: Optional scope for `variable_scope`. + + returns + 3d tensor that has the same shape as inputs. + ''' + + E = inputs.get_shape().as_list()[-1] # static + N, T = tf.shape(inputs)[0], tf.shape(inputs)[1] # dynamic + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # position indices + position_ind = tf.tile(tf.expand_dims( + tf.range(T), 0), [N, 1]) # (N, T) + + # First part of the PE function: sin and cos argument + position_enc = np.array([ + [pos / np.power(10000, (i-i % 2)/E) for i in range(E)] + for pos in range(maxlen)]) + + # Second part, apply the cosine to even columns and sin to odds. + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + position_enc = tf.convert_to_tensor( + position_enc, tf.float32) # (maxlen, E) + + # lookup + outputs = tf.nn.embedding_lookup(position_enc, position_ind) + + # masks + if masking: + outputs = tf.where(tf.equal(inputs, 0), inputs, outputs) + + return tf.to_float(outputs) + +# def noam_scheme(init_lr, global_step, warmup_steps=4000.): +# '''Noam scheme learning rate decay +# init_lr: initial learning rate. scalar. +# global_step: scalar. +# warmup_steps: scalar. During warmup_steps, learning rate increases +# until it reaches init_lr. +# ''' +# step = tf.cast(global_step + 1, dtype=tf.float32) +# return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) + + +class Transformer(object): + ''' + xs: tuple of + x: int32 tensor. (N, T1) + x_seqlens: int32 tensor. (N,) + sents1: str tensor. (N,) + ys: tuple of + decoder_input: int32 tensor. (N, T2) + y: int32 tensor. (N, T2) + y_seqlen: int32 tensor. (N, ) + sents2: str tensor. (N,) + training: boolean. 
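    With the defaults in hparams.py (batch_size=16, maxlen1=maxlen2=100) and the
    placeholders built in train_tf_transformer.py, x has shape (16, 100) while
    decoder_input and y have shape (16, 99), because the target sentence is
    shifted by one position to form the decoder input (y[:, :-1]) and the
    label (y[:, 1:]).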
+ ''' + + def __init__(self, hp): + self.hp = hp + # self.token2idx, self.idx2token = load_vocab(hp.vocab) + self.embeddings = get_token_embeddings( + self.hp.vocab_size, self.hp.d_model, zero_pad=True) + + def encode(self, xs, training=True): + ''' + Returns + memory: encoder outputs. (N, T1, d_model) + ''' + with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): + x = xs + + # src_masks + src_masks = tf.math.equal(x, 0) # (N, T1) + + # embedding + enc = tf.nn.embedding_lookup( + self.embeddings, x) # (N, T1, d_model) + enc *= self.hp.d_model**0.5 # scale + + enc += positional_encoding(enc, self.hp.maxlen1) + enc = tf.layers.dropout( + enc, self.hp.dropout_rate, training=training) + + # Blocks + for i in range(self.hp.num_blocks): + with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE): + # self-attention + enc = multihead_attention( + queries=enc, keys=enc, values=enc, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=src_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=False + ) + # feed forward + enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model]) + memory = enc + return memory, src_masks + + def decode(self, ys, memory, src_masks, training=True): + ''' + memory: encoder outputs. (N, T1, d_model) + src_masks: (N, T1) + + Returns + logits: (N, T2, V). float32. + y_hat: (N, T2). int32 + y: (N, T2). int32 + sents2: (N,). string. + ''' + with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE): + decoder_inputs = ys + + # tgt_masks + tgt_masks = tf.math.equal(decoder_inputs, 0) # (N, T2) + + # embedding + dec = tf.nn.embedding_lookup( + self.embeddings, decoder_inputs) # (N, T2, d_model) + dec *= self.hp.d_model ** 0.5 # scale + + dec += positional_encoding(dec, self.hp.maxlen2) + dec = tf.layers.dropout( + dec, self.hp.dropout_rate, training=training) + + # Blocks + for i in range(self.hp.num_blocks): + with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE): + # Masked self-attention (Note that causality is True at this time) + dec = multihead_attention( + queries=dec, keys=dec, values=dec, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=tgt_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=True, + scope="self_attention" + ) + # Vanilla attention + dec = multihead_attention( + queries=dec, keys=memory, values=memory, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=src_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=False, + scope="vanilla_attention" + ) + # Feed Forward + dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model]) + + # Final linear projection (embedding weights are shared) + weights = tf.transpose(self.embeddings) # (d_model, vocab_size) + logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size) + # y_hat = tf.to_int32(tf.argmax(logits, axis=-1)) + + return logits + + def train(self, xs, ys): + ''' + Returns + loss: scalar. + train_op: training operation + global_step: scalar. 
+ summaries: training summary node + ''' + # forward + memory, src_masks = self.encode(xs) + logits = self.decode(ys[0], memory, src_masks) + + # train scheme + y = ys[1] + y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size)) + loss = tf.nn.softmax_cross_entropy_with_logits_v2( + logits=logits, labels=y_) + + return loss + + # def eval(self, xs, ys): + # '''Predicts autoregressively + # At inference, input ys is ignored. + # Returns + # y_hat: (N, T2) + # ''' + # decoder_inputs, y, y_seqlen, sents2 = ys + + # decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx[""] + # ys = (decoder_inputs, y, y_seqlen, sents2) + + # memory, sents1, src_masks = self.encode(xs, False) + + # logging.info("Inference graph is being built. Please be patient.") + # for _ in tqdm(range(self.hp.maxlen2)): + # logits, y_hat, y, sents2 = self.decode(ys, memory, src_masks, False) + # if tf.reduce_sum(y_hat, 1) == self.token2idx[""]: break + + # _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1) + # ys = (_decoder_inputs, y, y_seqlen, sents2) + + # # monitor a random sample + # n = tf.random_uniform((), 0, tf.shape(y_hat)[0]-1, tf.int32) + # sent1 = sents1[n] + # pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token) + # sent2 = sents2[n] + + # tf.summary.text("sent1", sent1) + # tf.summary.text("pred", pred) + # tf.summary.text("sent2", sent2) + # summaries = tf.summary.merge_all() + + # return y_hat, summaries + + +# def convert_idx_to_token_tensor(inputs, idx2token): +# '''Converts int32 tensor to string tensor. +# inputs: 1d int32 tensor. indices. +# idx2token: dictionary + +# Returns +# 1d string tensor. +# ''' +# def my_func(inputs): +# return " ".join(idx2token[elem] for elem in inputs) + +# return tf.py_func(my_func, [inputs], tf.string) + +# def load_vocab(vocab_fpath): +# '''Loads vocabulary file and returns idx<->token maps +# vocab_fpath: string. vocabulary file path. +# Note that these are reserved +# 0: , 1: , 2: , 3: + +# Returns +# two dictionaries. 
+# ''' +# vocab = [line.split()[0] for line in open(vocab_fpath, 'r', encoding='utf-8').read().splitlines()] +# token2idx = {token: idx for idx, token in enumerate(vocab)} +# idx2token = {idx: token for idx, token in enumerate(vocab)} +# return token2idx, idx2token diff --git a/examples/nlp/train_hetu_transformer.py b/examples/nlp/train_hetu_transformer.py new file mode 100644 index 0000000..174f2b1 --- /dev/null +++ b/examples/nlp/train_hetu_transformer.py @@ -0,0 +1,62 @@ +from tqdm import tqdm +import os +import math +import logging +from hparams import Hparams +from hetu_transformer import Transformer +from data_load import DataLoader +import hetu as ht +import numpy as np +# import time + +logging.basicConfig(level=logging.INFO) + + +logging.info("# hparams") +hparams = Hparams() +parser = hparams.parser +hp = parser.parse_args() +print(hp) + +logging.info("# Prepare train/eval batches") +dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab) + +ctx = ht.gpu(1) +xs = ht.Variable(name='xs') +ys1 = ht.Variable(name='ys1') +ys2 = ht.Variable(name='ys2') +nonpadding = ht.Variable(name='nonpadding') + +logging.info("# Load model") +m = Transformer(hp) +loss = m.train(xs, (ys1, ys2)) +loss = ht.div_op(ht.reduce_sum_op(loss * nonpadding, + axes=[0, 1]), ht.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7) +opt = ht.optim.SGDOptimizer(hp.lr) +train_op = opt.minimize(loss) +executor = ht.Executor([loss, train_op], ctx=ctx) + +logging.info("# Session") + + +for ep in range(hp.num_epochs): + dataloader.make_epoch_data(hp.batch_size) + for i in tqdm(range(dataloader.batch_num)): + xs_val, ys_val = dataloader.get_batch() + # st = time.time() + xs_val = xs_val[0] + ys1_val = ys_val[0][:, :-1] + ys2_val = ys_val[0][:, 1:] + nonpadding_val = np.not_equal( + ys2_val, dataloader.get_pad()).astype(np.float32) + _loss, _ = executor.run( + feed_dict={xs: xs_val, ys1: ys1_val, ys2: ys2_val, nonpadding: nonpadding_val}) + # en = time.time() + # if i == 100: + # exit() + + log_str = 'Iteration %d, loss %f' % (i, _loss.asnumpy()) + print(log_str) + # print('time: ', (en - st)) + +logging.info("Done") diff --git a/examples/nlp/train_tf_transformer.py b/examples/nlp/train_tf_transformer.py new file mode 100644 index 0000000..8def5d0 --- /dev/null +++ b/examples/nlp/train_tf_transformer.py @@ -0,0 +1,98 @@ +import tensorflow as tf + +from tqdm import tqdm +import os +import math +import logging +from hparams import Hparams +from tf_transformer import Transformer +from data_load import DataLoader +# import time + +logging.basicConfig(level=logging.INFO) + + +logging.info("# hparams") +hparams = Hparams() +parser = hparams.parser +hp = parser.parse_args() +print(hp) +# save_hparams(hp, hp.logdir) + +logging.info("# Prepare train/eval batches") +dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab) + +xs = tf.placeholder(name='xs', dtype=tf.int32, shape=[16, 100]) +ys1 = tf.placeholder(name='ys1', dtype=tf.int32, shape=[16, 99]) +ys2 = tf.placeholder(name='ys2', dtype=tf.int32, shape=[16, 99]) + +logging.info("# Load model") +m = Transformer(hp) +loss = m.train(xs, (ys1, ys2)) +nonpadding = tf.to_float(tf.not_equal(ys2, dataloader.get_pad())) # 0: +loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7) + +global_step = tf.train.get_or_create_global_step() +optimizer = tf.train.GradientDescentOptimizer(hp.lr) +train_op = optimizer.minimize(loss, global_step=global_step) +# y_hat, eval_summaries = m.eval(xs, ys) +# y_hat = m.infer(xs, ys) + 
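# Illustrative helper (a minimal sketch, not called by the training loop below):
# it mirrors the padding-masked averaging above in plain numpy, assuming the pad
# token has index 0 as in the reserved vocabulary described in data_load.py.
def _masked_mean_loss_example(per_token_loss, targets, pad_id=0):
    import numpy as np
    mask = (targets != pad_id).astype(np.float32)
    # Only non-pad target positions contribute, so short sentences are not
    # dominated by padding; e.g. losses [[0.5, 0.7, 0.2], [0.4, 0.0, 0.0]] with
    # targets [[5, 9, 3], [7, 0, 0]] give (0.5 + 0.7 + 0.2 + 0.4) / 4 = 0.45.
    return (per_token_loss * mask).sum() / (mask.sum() + 1e-7)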
+logging.info("# Session") +saver = tf.train.Saver(max_to_keep=hp.num_epochs) +with tf.Session() as sess: + ckpt = tf.train.latest_checkpoint(hp.logdir) + if ckpt is None: + logging.info("Initializing from scratch") + sess.run(tf.global_variables_initializer()) + # save_variable_specs(os.path.join(hp.logdir, "specs")) + else: + saver.restore(sess, ckpt) + + _gs = sess.run(global_step) + + for ep in range(hp.num_epochs): + dataloader.make_epoch_data(hp.batch_size) + for i in tqdm(range(dataloader.batch_num)): + xs_val, ys_val = dataloader.get_batch() + # st = time.time() + _loss, _, _gs = sess.run([loss, train_op, global_step], feed_dict={ + xs: xs_val[0], ys1: ys_val[0][:, :-1], ys2: ys_val[0][:, 1:]}) + # en = time.time() + # if i == 100: + # exit() + # epoch = math.ceil(_gs / num_train_batches) + + log_str = 'Iteration %d, loss %f' % (i, _loss) + print(log_str) + # print('time: ', (en - st)) + + # logging.info("epoch {} is done".format(ep)) + # _loss = sess.run(loss) # train loss + + # logging.info("# test evaluation") + # _, _eval_summaries = sess.run([eval_init_op, eval_summaries]) + # summary_writer.add_summary(_eval_summaries, _gs) + + # logging.info("# get hypotheses") + # hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, m.idx2token) + + # logging.info("# write results") + # model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss) + # if not os.path.exists(hp.evaldir): os.makedirs(hp.evaldir) + # translation = os.path.join(hp.evaldir, model_output) + # with open(translation, 'w') as fout: + # fout.write("\n".join(hypotheses)) + + # logging.info("# calc bleu score and append it to translation") + # calc_bleu(hp.eval3, translation) + + # logging.info("# save models") + # ckpt_name = os.path.join(hp.logdir, model_output) + # saver.save(sess, ckpt_name, global_step=_gs) + # logging.info("after training of {} epochs, {} has been saved.".format(epoch, ckpt_name)) + + # logging.info("# fall back to train mode") + + +logging.info("Done") diff --git a/examples/nlp/wikipedia.py b/examples/nlp/wikipedia.py new file mode 100644 index 0000000..d86c9ac --- /dev/null +++ b/examples/nlp/wikipedia.py @@ -0,0 +1,540 @@ +"""Wikipedia dataset containing cleaned articles of all languages.""" +import bz2 +import codecs +import json +import re +import xml.etree.cElementTree as etree + +import datasets + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@ONLINE {wikidump, + author = {Wikimedia Foundation}, + title = {Wikimedia Downloads}, + url = {https://dumps.wikimedia.org} +} +""" + +_DESCRIPTION = """\ +Wikipedia dataset containing cleaned articles of all languages. +The datasets are built from the Wikipedia dump +(https://dumps.wikimedia.org/) with one split per language. Each example +contains the content of one full Wikipedia article with cleaning to strip +markdown and unwanted sections (references, etc.). +""" + +_LICENSE = ( + "This work is licensed under the Creative Commons Attribution-ShareAlike " + "3.0 Unported License. To view a copy of this license, visit " + "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to " + "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA." +) + +# Source: https://en.wikipedia.org/wiki/List_of_Wikipedias (accessed 3/1/2019) +# Removed because no articles: hz. 
+WIKIPEDIA_LANGUAGES = [ + "aa", + "ab", + "ace", + "ady", + "af", + "ak", + "als", + "am", + "an", + "ang", + "ar", + "arc", + "arz", + "as", + "ast", + "atj", + "av", + "ay", + "az", + "azb", + "ba", + "bar", + "bat-smg", + "bcl", + "be", + "be-x-old", + "bg", + "bh", + "bi", + "bjn", + "bm", + "bn", + "bo", + "bpy", + "br", + "bs", + "bug", + "bxr", + "ca", + "cbk-zam", + "cdo", + "ce", + "ceb", + "ch", + "cho", + "chr", + "chy", + "ckb", + "co", + "cr", + "crh", + "cs", + "csb", + "cu", + "cv", + "cy", + "da", + "de", + "din", + "diq", + "dsb", + "dty", + "dv", + "dz", + "ee", + "el", + "eml", + "en", + "eo", + "es", + "et", + "eu", + "ext", + "fa", + "ff", + "fi", + "fiu-vro", + "fj", + "fo", + "fr", + "frp", + "frr", + "fur", + "fy", + "ga", + "gag", + "gan", + "gd", + "gl", + "glk", + "gn", + "gom", + "gor", + "got", + "gu", + "gv", + "ha", + "hak", + "haw", + "he", + "hi", + "hif", + "ho", + "hr", + "hsb", + "ht", + "hu", + "hy", + "ia", + "id", + "ie", + "ig", + "ii", + "ik", + "ilo", + "inh", + "io", + "is", + "it", + "iu", + "ja", + "jam", + "jbo", + "jv", + "ka", + "kaa", + "kab", + "kbd", + "kbp", + "kg", + "ki", + "kj", + "kk", + "kl", + "km", + "kn", + "ko", + "koi", + "krc", + "ks", + "ksh", + "ku", + "kv", + "kw", + "ky", + "la", + "lad", + "lb", + "lbe", + "lez", + "lfn", + "lg", + "li", + "lij", + "lmo", + "ln", + "lo", + "lrc", + "lt", + "ltg", + "lv", + "mai", + "map-bms", + "mdf", + "mg", + "mh", + "mhr", + "mi", + "min", + "mk", + "ml", + "mn", + "mr", + "mrj", + "ms", + "mt", + "mus", + "mwl", + "my", + "myv", + "mzn", + "na", + "nah", + "nap", + "nds", + "nds-nl", + "ne", + "new", + "ng", + "nl", + "nn", + "no", + "nov", + "nrm", + "nso", + "nv", + "ny", + "oc", + "olo", + "om", + "or", + "os", + "pa", + "pag", + "pam", + "pap", + "pcd", + "pdc", + "pfl", + "pi", + "pih", + "pl", + "pms", + "pnb", + "pnt", + "ps", + "pt", + "qu", + "rm", + "rmy", + "rn", + "ro", + "roa-rup", + "roa-tara", + "ru", + "rue", + "rw", + "sa", + "sah", + "sat", + "sc", + "scn", + "sco", + "sd", + "se", + "sg", + "sh", + "si", + "simple", + "sk", + "sl", + "sm", + "sn", + "so", + "sq", + "sr", + "srn", + "ss", + "st", + "stq", + "su", + "sv", + "sw", + "szl", + "ta", + "tcy", + "te", + "tet", + "tg", + "th", + "ti", + "tk", + "tl", + "tn", + "to", + "tpi", + "tr", + "ts", + "tt", + "tum", + "tw", + "ty", + "tyv", + "udm", + "ug", + "uk", + "ur", + "uz", + "ve", + "vec", + "vep", + "vi", + "vls", + "vo", + "wa", + "war", + "wo", + "wuu", + "xal", + "xh", + "xmf", + "yi", + "yo", + "za", + "zea", + "zh", + "zh-classical", + "zh-min-nan", + "zh-yue", + "zu", +] + +_BASE_URL_TMPL = "https://dumps.wikimedia.org/{lang}wiki/{date}/" +_INFO_FILE = "dumpstatus.json" + + +class WikipediaConfig(datasets.BuilderConfig): + """BuilderConfig for Wikipedia.""" + + def __init__(self, language=None, date=None, **kwargs): + """BuilderConfig for Wikipedia. + Args: + language: string, the language code for the Wikipedia dump to use. + date: string, date of the Wikipedia dump in YYYYMMDD format. A list of + available dates can be found at https://dumps.wikimedia.org/enwiki/. + **kwargs: keyword arguments forwarded to super. 
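        For example, WikipediaConfig(language="en", date="20200501") selects the
        English dump of 2020-05-01 and is registered under the builder config
        name "20200501.en".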
+ """ + super(WikipediaConfig, self).__init__( + name="{0}.{1}".format(date, language), + description="Wikipedia dataset for {0}, parsed from {1} dump.".format( + language, date), + **kwargs, + ) + self.date = date + self.language = language + + +_VERSION = datasets.Version("1.0.0", "") + + +class Wikipedia(datasets.BeamBasedBuilder): + """Wikipedia dataset.""" + + # Use mirror (your.org) to avoid download caps. + BUILDER_CONFIG_CLASS = WikipediaConfig + BUILDER_CONFIGS = [ + WikipediaConfig( + version=_VERSION, + language=lang, + date="20200501", + ) # pylint:disable=g-complex-comprehension + for lang in WIKIPEDIA_LANGUAGES + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features({"title": datasets.Value( + "string"), "text": datasets.Value("string")}), + # No default supervised_keys. + supervised_keys=None, + homepage="https://dumps.wikimedia.org", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager, pipeline): + def _base_url(lang): + return _BASE_URL_TMPL.format(lang=lang.replace("-", "_"), date=self.config.date) + + lang = self.config.language + + info_url = _base_url(lang) + _INFO_FILE + # Use dictionary since testing mock always returns the same result. + downloaded_files = dl_manager.download_and_extract({"info": info_url}) + + xml_urls = [] + total_bytes = 0 + with open(downloaded_files["info"], encoding="utf-8") as f: + dump_info = json.load(f) + multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"] + assert ( + multistream_dump_info["status"] == "done" + ), "Specified dump (%s) multistream status is not 'done': %s" % ( + _base_url(lang), + multistream_dump_info["status"], + ) + + for fname, info in multistream_dump_info["files"].items(): + if ".xml" not in fname: + continue + total_bytes += info["size"] + xml_urls.append(_base_url(lang) + fname) + + # Use dictionary since testing mock always returns the same result. + downloaded_files = dl_manager.download({"xml": xml_urls}) + if not pipeline.is_local(): + downloaded_files = dl_manager.ship_files_with_pipeline( + downloaded_files, pipeline) + + return [ + datasets.SplitGenerator( # pylint:disable=g-complex-comprehension + name=datasets.Split.TRAIN, gen_kwargs={ + "filepaths": downloaded_files["xml"], "language": lang} + ) + ] + + def _build_pcollection(self, pipeline, filepaths, language): + """Build PCollection of examples in the raw (text) form.""" + import apache_beam as beam + import mwparserfromhell + + def _extract_content(filepath): + """Extracts article content from a single WikiMedia XML file.""" + logger.info("generating examples from = %s", filepath) + with beam.io.filesystems.FileSystems.open(filepath) as f: + f = bz2.BZ2File(filename=f) + # Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563 + utf_f = codecs.getreader("utf-8")(f) + context = etree.iterparse(utf_f, events=("end",)) + for unused_event, elem in context: + if not elem.tag.endswith("page"): + continue + namespace = elem.tag[:-4] + title = elem.find("./{0}title".format(namespace)).text + ns = elem.find("./{0}ns".format(namespace)).text + id_ = elem.find("./{0}id".format(namespace)).text + + # Filter pages that are not in the "main" namespace. + if ns != "0": + elem.clear() + continue + + raw_content = elem.find( + "./{0}revision/{0}text".format(namespace)).text + elem.clear() + + # Filter redirects. 
+ if raw_content is None or raw_content.lower().startswith("#redirect"): + beam.metrics.Metrics.counter( + language, "filtered-redirects").inc() + continue + + beam.metrics.Metrics.counter( + language, "extracted-examples").inc() + yield (id_, title, raw_content) + + def _clean_content(inputs): + """Cleans raw wikicode to extract text.""" + id_, title, raw_content = inputs + try: + text = _parse_and_clean_wikicode( + raw_content, parser=mwparserfromhell) + except (mwparserfromhell.parser.ParserError) as e: + beam.metrics.Metrics.counter(language, "parser-error").inc() + logger.error("mwparserfromhell ParseError: %s", e) + return + + if not text: + beam.metrics.Metrics.counter( + language, "empty-clean-examples").inc() + return + + beam.metrics.Metrics.counter(language, "cleaned-examples").inc() + + yield id_, {"title": title, "text": text} + + return ( + pipeline + | "Initialize" >> beam.Create(filepaths) + | "Extract content" >> beam.FlatMap(_extract_content) + | "Distribute" >> beam.transforms.Reshuffle() + | "Clean content" >> beam.FlatMap(_clean_content) + ) + + +def _parse_and_clean_wikicode(raw_content, parser): + """Strips formatting and unwanted sections from raw page content.""" + wikicode = parser.parse(raw_content) + + # Filters for references, tables, and file/image links. + re_rm_wikilink = re.compile( + "^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE) + + def rm_wikilink(obj): + return bool(re_rm_wikilink.match(str(obj.title))) + + def rm_tag(obj): + return str(obj.tag) in {"ref", "table"} + + def rm_template(obj): + return obj.name.lower() in {"reflist", "notelist", "notelist-ua", "notelist-lr", "notelist-ur", "notelist-lg"} + + def try_remove_obj(obj, section): + try: + section.remove(obj) + except ValueError: + # For unknown reasons, objects are sometimes not found. + pass + + section_text = [] + # Filter individual sections to clean. + for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True): + for obj in section.ifilter_wikilinks(matches=rm_wikilink, recursive=True): + try_remove_obj(obj, section) + for obj in section.ifilter_templates(matches=rm_template, recursive=True): + try_remove_obj(obj, section) + for obj in section.ifilter_tags(matches=rm_tag, recursive=True): + try_remove_obj(obj, section) + + section_text.append(section.strip_code().strip()) + return "\n\n".join(section_text) diff --git a/examples/rec/.gitignore b/examples/rec/.gitignore new file mode 100644 index 0000000..ccb0cb2 --- /dev/null +++ b/examples/rec/.gitignore @@ -0,0 +1,2 @@ +datasets/ +logs/ diff --git a/examples/rec/README.md b/examples/rec/README.md new file mode 100644 index 0000000..5fc2afa --- /dev/null +++ b/examples/rec/README.md @@ -0,0 +1,42 @@ +# Recommendation Model Example (with Distributed Settings) +In this directory we provide NCF model for recommendation task on movielens dataset. + +## Structure +``` +- rec + - run_hetu.py basic trainer for hetu + - run_tf.py basic trainer for tensorflow + - run_tfworker.py trainer for tensorflow in PS setting + - run_parallax.py trainer for tensorflow in parallax setting + - hetu_ncf.py model implementatino in hetu + - tf_ncf.py model implementation in tensorflow + - movielens.py script to download and handle dataset +``` + +## Prepare movielens data +Simply `python movielens.py` . 
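This downloads MovieLens-25M and writes `train.npz` (arrays `user_input`, `item_input`, `labels`) and `test.npy` under `datasets/ml-25m/`. A minimal sketch of inspecting the generated files (assuming the script has finished and you run this from the same directory):

```python
import numpy as np

train = np.load('datasets/ml-25m/train.npz')
test = np.load('datasets/ml-25m/test.npy')

# Training triples: label 1 marks an observed user-item interaction,
# label 0 a randomly sampled negative item.
print(train['user_input'].shape, train['item_input'].shape, train['labels'].shape)

# One row of 100 candidate items per user: the held-out latest positive in
# column 0 followed by 99 sampled negatives, as used for HR/NDCG evaluation.
print(test.shape)
```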
+ +## Usage +```bash +# run locally +python run_hetu.py +# run in ps setting (locally) +bash ps_ncf.sh +# run in hybrid setting (locally) +bash hybrid_ncf.sh + +# run tensorflow locally +python run_tf.py +# run tensorflow in parallax +python {absolute_path_to}/run_parallax.py +# run tensorflow in ps setting +python ../ctr/tf_launch_server.py --config {config} --id {rank} +python run_tfworker.py --rank {rank} --config {config} +# or +python ../ctr/tf_launch_server.py --config ../ctr/settings/tf_local_s1_w8.json --id 0 +bash tf_8workers.sh +``` + + +## Configuration +Please refer to `ctr` directory. diff --git a/examples/rec/hetu_ncf.py b/examples/rec/hetu_ncf.py new file mode 100644 index 0000000..ecf8b5d --- /dev/null +++ b/examples/rec/hetu_ncf.py @@ -0,0 +1,47 @@ +import hetu as ht +from hetu import init + +import numpy as np + + +def neural_mf(user_input, item_input, y_, num_users, num_items): + embed_dim = 8 + layers = [64, 32, 16, 8] + learning_rate = 0.01 + + User_Embedding = init.random_normal( + (num_users, embed_dim + layers[0] // 2), stddev=0.01, name="user_embed", ctx=ht.cpu(0)) + Item_Embedding = init.random_normal( + (num_items, embed_dim + layers[0] // 2), stddev=0.01, name="item_embed", ctx=ht.cpu(0)) + + user_latent = ht.embedding_lookup_op( + User_Embedding, user_input, ctx=ht.cpu(0)) + item_latent = ht.embedding_lookup_op( + Item_Embedding, item_input, ctx=ht.cpu(0)) + + mf_user_latent = ht.slice_op(user_latent, (0, 0), (-1, embed_dim)) + mlp_user_latent = ht.slice_op(user_latent, (0, embed_dim), (-1, -1)) + mf_item_latent = ht.slice_op(item_latent, (0, 0), (-1, embed_dim)) + mlp_item_latent = ht.slice_op(item_latent, (0, embed_dim), (-1, -1)) + + W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1') + W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2') + W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3') + W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4') + + mf_vector = ht.mul_op(mf_user_latent, mf_item_latent) + mlp_vector = ht.concat_op(mlp_user_latent, mlp_item_latent, axis=1) + fc1 = ht.matmul_op(mlp_vector, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + fc3 = ht.matmul_op(relu2, W3) + relu3 = ht.relu_op(fc3) + concat_vector = ht.concat_op(mf_vector, relu3, axis=1) + y = ht.matmul_op(concat_vector, W4) + y = ht.sigmoid_op(y) + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + return loss, y, train_op diff --git a/examples/rec/hybrid_ncf.sh b/examples/rec/hybrid_ncf.sh new file mode 100644 index 0000000..39f7c9a --- /dev/null +++ b/examples/rec/hybrid_ncf.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_hetu.py + +python -m hetu.launcher ${workdir}/../ctr/settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_w4.yml diff --git a/examples/rec/movielens.py b/examples/rec/movielens.py new file mode 100644 index 0000000..90b5048 --- /dev/null +++ b/examples/rec/movielens.py @@ -0,0 +1,119 @@ +import os +import wget +import zipfile +from collections import defaultdict as dd +import numpy as np +import scipy.sparse as sp +from tqdm import tqdm + + +DATASETS = ["ml-1m", "ml-20m", "ml-25m"] +urls = { + "ml-1m": "https://files.grouplens.org/datasets/movielens/ml-1m.zip", + "ml-20m": 
"https://files.grouplens.org/datasets/movielens/ml-20m.zip", + "ml-25m": "https://files.grouplens.org/datasets/movielens/ml-25m.zip", +} + + +def download(dataset, data_dir, num_negatives=4): + if not os.path.exists(data_dir): + os.mkdir(data_dir) + assert dataset in ["ml-1m", "ml-20m", + "ml-25m"], 'Invalid dataset: %s.' % dataset + data_subdir = os.path.join(data_dir, dataset) + print('Data in', data_subdir) + zip_file = os.path.join(data_dir, dataset + '.zip') + ratings = os.path.join(data_subdir, 'ratings.csv') + if not os.path.exists(ratings): + if not os.path.exists(zip_file): + print('Downloading movielens %s...' % dataset) + wget.download(urls[dataset], zip_file) + with zipfile.ZipFile(zip_file, 'r') as zip_ref: + print('Extracting movielens %s...' % dataset) + zip_ref.extractall(data_dir) + ratings = os.path.join(data_subdir, 'ratings.csv') + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }[dataset] + + # Generate raw training and testing files + item_reverse_mapping = {} + cur_item_idx = 0 + latest = [(0, -1)] * num_users + mat = sp.dok_matrix((num_users, num_items), dtype=np.float32) + with open(ratings, 'r') as fr: + fr.readline() + for line in tqdm(fr): + entries = line.strip().split(',') + user = int(entries[0]) + item = int(entries[1]) + if item not in item_reverse_mapping: + item_reverse_mapping[item] = cur_item_idx + cur_item_idx += 1 + rating = float(entries[2]) + if rating <= 0: + continue + reitem = item_reverse_mapping[item] + mat[user-1, reitem] = 1 + timestamp = int(entries[-1]) + if latest[user-1][0] < timestamp: + latest[user-1] = (timestamp, reitem) + print('#users:', num_users, '#items:', num_items) + + new_lates = np.concatenate((np.array(latest, dtype=np.int32)[ + :, 1:], np.empty((num_users, 99), dtype=np.int32)), 1) + + # sample for test data first, each user 99 items, using all data + for i, lat in enumerate(latest): + new_lates[i][0] = lat[1] + for k in range(1, 100): + j = np.random.randint(num_items) + while (i, j) in mat.keys(): + j = np.random.randint(num_items) + new_lates[i][k] = j + np.save(os.path.join(data_subdir, 'test.npy'), new_lates) + + # sample for train data, each data with num_negative negative samples + all_num = (1 + num_negatives) * (len(mat.keys()) - num_users) + user_input = np.empty((all_num,), dtype=np.int32) + item_input = np.empty((all_num,), dtype=np.int32) + labels = np.empty((all_num,), dtype=np.int32) + idx = 0 + for (i, j) in mat.keys(): + if new_lates[i][0] == j: + continue + # positive instance + user_input[idx] = i + item_input[idx] = j + labels[idx] = 1 + idx += 1 + # negative instances + for t in range(num_negatives): + k = np.random.randint(num_items) + while (i, k) in mat.keys(): + k = np.random.randint(num_items) + user_input[idx] = i + item_input[idx] = k + labels[idx] = 0 + idx += 1 + assert all_num == idx + np.savez(os.path.join(data_subdir, 'train.npz'), + user_input=user_input, item_input=item_input, labels=labels) + + +def getdata(dataset, data_dir='datasets'): + assert dataset in ["ml-1m", "ml-20m", + "ml-25m"], 'Invalid dataset: %s.' 
% dataset + data_subdir = os.path.join(data_dir, dataset) + file_paths = [os.path.join(data_subdir, data) + for data in ['train.npz', 'test.npy']] + if any([not os.path.exists(path) for path in file_paths]): + download(dataset, data_dir) + return np.load(file_paths[0]), np.load(file_paths[1]) + + +if __name__ == "__main__": + download('ml-25m', 'datasets') diff --git a/examples/rec/ps_ncf.sh b/examples/rec/ps_ncf.sh new file mode 100644 index 0000000..415f752 --- /dev/null +++ b/examples/rec/ps_ncf.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_hetu.py + +python ${mainpy} --comm PS --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_s1_w4.yml diff --git a/examples/rec/run_hetu.py b/examples/rec/run_hetu.py new file mode 100644 index 0000000..e20cb85 --- /dev/null +++ b/examples/rec/run_hetu.py @@ -0,0 +1,187 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import numpy as np +import yaml +import time +import math +import argparse +from tqdm import tqdm +from hetu_ncf import neural_mf +import heapq # for retrieval topK + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/hetulog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def worker(args): + def validate(): + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + predictions = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + map_item_score = { + testItemInput[start_index + i]: predictions[0][i] for i in range(100)} + gtItem = testItemInput[start_index] + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, gtItem) + ndcg = getNDCG(ranklist, gtItem) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + def get_current_shard(data): + if args.comm is not None: + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + else: + return data + + device_id = 0 + if args.comm == 'PS': + rank = ht.get_worker_communicate().rank() + nrank = int(os.environ['DMLC_NUM_WORKER']) + device_id = rank % 8 + elif args.comm == 'Hybrid': + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + rank = comm.rank + nrank = int(os.environ['DMLC_NUM_WORKER']) + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input']) + trainItems = get_current_shard(trainData['item_input']) + trainLabels = get_current_shard(trainData['labels']) + testData = get_current_shard(testData) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input'][:1024000]) + trainItems = get_current_shard(trainData['item_input'][:1024000]) + trainLabels = get_current_shard(trainData['labels'][:1024000]) + testData = get_current_shard(testData[:1470]) + 
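        # test.npy holds one row of 100 candidate items per user (the held-out
        # positive first, then 99 sampled negatives), so each user id is
        # repeated 100 times and the item matrix is flattened to line up.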
testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + # assert not args.all or num_users == testData.shape[0] + batch_size = 1024 + num_negatives = 4 + topK = 10 + user_input = ht.dataloader_op([ + ht.Dataloader(trainUsers, batch_size, 'train'), + ht.Dataloader(testUserInput, 100, 'validate'), + ]) + item_input = ht.dataloader_op([ + ht.Dataloader(trainItems, batch_size, 'train'), + ht.Dataloader(testItemInput, 100, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'), + ]) + + loss, y, train_op = neural_mf( + user_input, item_input, y_, num_users, num_items) + + executor = ht.Executor({'train': [loss, train_op], 'validate': [y]}, ctx=ht.gpu(device_id), + comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123) + + path = 'logs/hetulog_%s' % ({None: 'local', + 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm]) + path += '_%d.txt' % rank if args.comm else '.txt' + log = Logging(path=path) + epoch = 7 + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in tqdm(range(executor.get_batch_num('train'))): + loss_val = executor.run('train', convert_to_numpy_ret_vals=True) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data, default to use 1024000 training data") + parser.add_argument("--comm", default=None, + help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument( + "--config", type=str, default="./settings/local_s1_w4.yml", help="configuration for ps") + args = parser.parse_args() + + if args.comm is None: + worker(args) + elif args.comm == 'Hybrid': + settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader) + value = settings['shared'] + os.environ['DMLC_ROLE'] = 'worker' + for k, v in value.items(): + os.environ[k] = str(v) + worker(args) + elif args.comm == 'PS': + launch(worker, args) + else: + raise NotImplementedError diff --git a/examples/rec/run_parallax.py b/examples/rec/run_parallax.py new file mode 100644 index 0000000..276d49a --- /dev/null +++ b/examples/rec/run_parallax.py @@ -0,0 +1,192 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + +from autodist import AutoDist +from autodist.resource_spec import ResourceSpec +from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, 
Parallax +from autodist.strategy.base import Strategy +from autodist.kernel.common.utils import get_op_name +from tensorflow.python.framework import ops + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# Please DO NOT modify /etc/bash.bashrc to activate conda environment. +# Use python_venv in spec yml file instead. +# Use absolute path of python file. +# Here we use the tf native partitioner instead of autodist's PartitionPS. + + +class Parallaxx(PSLoadBalancing, AllReduce): + """ + Modify original parallax to remove replica on CPUs. + """ + + def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0): + PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness) + AllReduce.__init__(self, chunk_size) + + # pylint: disable=attribute-defined-outside-init + def build(self, graph_item, resource_spec): + """Generate the strategy.""" + expr = Strategy() + + # For each variable, generate variable synchronizer config + expr.graph_config.replicas.extend( + [k for k, v in resource_spec.gpu_devices]) + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + self.loads = {ps: 0.0 for ps in reduction_device_names} + + # Generate node config + node_config = [] + for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): + var_op_name = get_op_name(var.name) + grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] + if isinstance(grad, ops.Tensor): # this is a dense variable + group_id = idx // self.chunk_size + config = self._gen_all_reduce_node_config( + var.name, group=group_id) + else: # sparse updates + # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. + # Sparse variables are likely larger, so keeping copies would be costlier, + # and usually each device only requires a small part of the overall variable. + config = self._gen_ps_node_config( + var, + # For Parallax Strategy, all PS vars are sparse which does not need proxy. 
+ False, + self._sync, + self._staleness + ) + node_config.append(config) + expr.node_config.extend(node_config) + + return expr + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def main(): + resource_spec_file = os.path.join(os.path.dirname( + __file__), '../ctr/settings', 'plx_local_spec.yml') + autodist = AutoDist(resource_spec_file, Parallaxx()) + respec = ResourceSpec(resource_spec_file) + + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(num_users): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + from movielens import getdata + trainData, testData = getdata('ml-25m', 'datasets') + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + num_negatives = 4 + topK = 10 + with tf.Graph().as_default() as g, autodist.scope(): + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + loss, y, opt = neural_mf( + user_input, item_input, y_, num_users, num_items) + train_op = opt.minimize(loss) + + sess = autodist.create_distributed_session() + + log = Logging(path=os.path.join( + os.path.dirname(__file__), 'logs', 'tfplx.txt')) + epoch = 7 + iterations = trainData['user_input'].shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainData['user_input'][start_index:start_index+batch_size], + item_input: trainData['item_input'][start_index:start_index+batch_size], + y_: trainData['labels'][start_index:start_index+batch_size], + } + + loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCG: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/run_tf.py b/examples/rec/run_tf.py new file mode 100644 index 0000000..f5c1e03 --- /dev/null +++ b/examples/rec/run_tf.py @@ -0,0 +1,145 @@
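+# Single-process TensorFlow NCF baseline on MovieLens; the Hetu counterpart is run_hetu.py in this directory.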
+import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = trainData['user_input'] + trainItems = trainData['item_input'] + trainLabels = trainData['labels'] + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = trainData['user_input'][:1024000] + trainItems = trainData['item_input'][:1024000] + trainLabels = trainData['labels'][:1024000] + testData = testData[:1470] + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + num_negatives = 4 + topK = 10 + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + loss, y, opt = neural_mf(user_input, item_input, y_, num_users, num_items) + train_op = opt.minimize(loss) + + init = tf.compat.v1.global_variables_initializer() + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + sess = tf.compat.v1.Session( + config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + sess.run(init) + + log = Logging() + epoch = 7 + iterations = trainUsers.shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainUsers[start_index:start_index+batch_size], + item_input: trainItems[start_index:start_index+batch_size], + y_: trainLabels[start_index:start_index+batch_size], + } +
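+ # one optimization step: sess.run on [loss, train_op] returns [batch_loss, None], so loss_val[0] is the scalar loss for this batch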
+ loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time:%f' % (time.time() - start)) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/run_tfworker.py b/examples/rec/run_tfworker.py new file mode 100644 index 0000000..20c0402 --- /dev/null +++ b/examples/rec/run_tfworker.py @@ -0,0 +1,192 @@ +import os +import json +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def train_ncf(cluster, rank, nrank, args): + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + def get_current_shard(data): + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input']) + trainItems = get_current_shard(trainData['item_input']) + trainLabels = get_current_shard(trainData['labels']) + testData = get_current_shard(testData) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input'][:1024000]) + trainItems = get_current_shard(trainData['item_input'][:1024000]) + trainLabels = get_current_shard(trainData['labels'][:1024000]) + testData = get_current_shard(testData[:1470]) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + 
num_negatives = 4 + topK = 10 + + worker_device = "/job:worker/task:%d/gpu:0" % (rank) + with tf.device(worker_device): + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): + server_num = len(cluster.as_dict()['ps']) + embed_partitioner = tf.fixed_size_partitioner( + server_num, 0) if server_num > 1 else None + loss, y, opt = neural_mf( + user_input, item_input, y_, num_users, num_items, embed_partitioner) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=rank) + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(rank == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % rank]) + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + + log = Logging(path='logs/tflog%d.txt' % rank) + epoch = 7 + iterations = trainUsers.shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in tqdm(range(iterations)): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainUsers[start_index:start_index+batch_size], + item_input: trainItems[start_index:start_index+batch_size], + y_: trainLabels[start_index:start_index+batch_size], + } + + loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, default='../ctr/settings/tf_local_s1_w2.json', help="config file path") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + task_id = int(args.rank) + raw_config = args.config + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + train_ncf(cluster, task_id, len(config['worker']), args) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/tf_8workers.sh b/examples/rec/tf_8workers.sh new file mode 100644 index 0000000..cd8c494 --- /dev/null +++ b/examples/rec/tf_8workers.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_tfworker.py + +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 0 & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 1 & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 2 & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 3 & +CUDA_VISIBLE_DEVICES=4 python ${mainpy} --config 
${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 4 & +CUDA_VISIBLE_DEVICES=5 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 5 & +CUDA_VISIBLE_DEVICES=6 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 6 & +CUDA_VISIBLE_DEVICES=7 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 7 & +wait \ No newline at end of file diff --git a/examples/rec/tf_ncf.py b/examples/rec/tf_ncf.py new file mode 100644 index 0000000..c8ca51f --- /dev/null +++ b/examples/rec/tf_ncf.py @@ -0,0 +1,47 @@ +import tensorflow as tf + + +def neural_mf(user_input, item_input, y_, num_users, num_items, embed_partitioner=None): + embed_dim = 8 + layers = [64, 32, 16, 8] + learning_rate = 0.01 + with tf.compat.v1.variable_scope('nmf', dtype=tf.float32): + with tf.device('/cpu:0'): + User_Embedding = tf.compat.v1.get_variable(name="user_embed", shape=( + num_users, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner) + Item_Embedding = tf.compat.v1.get_variable(name="item_embed", shape=( + num_items, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner) + + user_latent = tf.nn.embedding_lookup(User_Embedding, user_input) + item_latent = tf.nn.embedding_lookup(Item_Embedding, item_input) + + W1 = tf.compat.v1.get_variable(name='W1', shape=( + layers[0], layers[1]), initializer=tf.random_normal_initializer(stddev=0.1)) + W2 = tf.compat.v1.get_variable(name='W2', shape=( + layers[1], layers[2]), initializer=tf.random_normal_initializer(stddev=0.1)) + W3 = tf.compat.v1.get_variable(name='W3', shape=( + layers[2], layers[3]), initializer=tf.random_normal_initializer(stddev=0.1)) + W4 = tf.compat.v1.get_variable(name='W4', shape=( + embed_dim + layers[3], 1), initializer=tf.random_normal_initializer(stddev=0.1)) + + with tf.device('/gpu:0'): + mf_user_latent, mlp_user_latent = tf.split( + user_latent, [embed_dim, layers[0] // 2], 1) + mf_item_latent, mlp_item_latent = tf.split( + item_latent, [embed_dim, layers[0] // 2], 1) + mf_vector = tf.multiply(mf_user_latent, mf_item_latent) + mlp_vector = tf.concat((mlp_user_latent, mlp_item_latent), 1) + fc1 = tf.matmul(mlp_vector, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + fc3 = tf.matmul(relu2, W3) + relu3 = tf.nn.relu(fc3) + concat_vector = tf.concat((mf_vector, relu3), 1) + y = tf.reshape(tf.matmul(concat_vector, W4), (-1,)) + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + y = tf.sigmoid(y) + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/runner/README.md b/examples/runner/README.md new file mode 100644 index 0000000..272dd45 --- /dev/null +++ b/examples/runner/README.md @@ -0,0 +1,27 @@ +## Usage +This directory contains examples using `heturun` command. 
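+ +All of the commands below pass a cluster configuration file to `heturun` via `-c`. For reference, `local_ps.yml` in this directory declares a single local node running one parameter server and four workers; the other `.yml` files here follow the same schema: +```yaml +nodes: + - host: localhost + servers: 1 + workers: 4 + chief: true +```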
+ +* Data Parallel (MLP model and WDL model): +```bash +# Local Data Parallel Using AllReduce +heturun -c local_allreduce.yml python run_mlp.py --config lar + +# Local Data Parallel Using AllReduce for Dense Parameters and PS for Sparse(Embedding) Parameters +heturun -c local_ps.yml python run_wdl.py --config lhy + +# Local Data Parallel Using PS +heturun -c local_ps.yml python run_mlp.py --config lps +heturun -c local_ps.yml python run_wdl.py --config lps + +# Distributed Data Parallel Using AllReduce +heturun -c remote_allreduce.yml python run_mlp.py --config rar + +# Distributed Data Parallel Using AllReduce for Dense Parameters and PS for Sparse(Embedding) Parameters +heturun -c remote_ps.yml python run_wdl.py --config rhy + +# Distributed Data Parallel Using PS +heturun -c remote_ps.yml python run_mlp.py --config rps +heturun -c remote_ps.yml python run_wdl.py --config rps +``` + +* For other parallel schemes, please refer to `parallel` directory. diff --git a/examples/runner/local_allreduce.yml b/examples/runner/local_allreduce.yml new file mode 100644 index 0000000..5028d3f --- /dev/null +++ b/examples/runner/local_allreduce.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 4 + chief: true diff --git a/examples/runner/local_ps.yml b/examples/runner/local_ps.yml new file mode 100644 index 0000000..c71709c --- /dev/null +++ b/examples/runner/local_ps.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 1 + workers: 4 + chief: true diff --git a/examples/runner/models/MLP.py b/examples/runner/models/MLP.py new file mode 100644 index 0000000..df60da8 --- /dev/null +++ b/examples/runner/models/MLP.py @@ -0,0 +1,33 @@ +import hetu as ht +from hetu import init + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def mlp(x, y_): + ''' + MLP model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building MLP model...") + x = fc(x, (784, 256), 'mlp_fc1', with_relu=True) + x = fc(x, (256, 256), 'mlp_fc2', with_relu=True) + y = fc(x, (256, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/runner/models/__init__.py b/examples/runner/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/runner/models/load_data.py b/examples/runner/models/load_data.py new file mode 100644 index 0000000..4edeb0d --- /dev/null +++ b/examples/runner/models/load_data.py @@ -0,0 +1,220 @@ +import numpy as np +import six.moves.cPickle as pickle +import gzip +import os + + +def load_mnist_data(dataset): + """ Load the dataset + Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + """ + # Download the MNIST dataset if it is not present + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. 
+ new_path = os.path.join( + os.path.split(__file__)[0], + dataset + ) + if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': + from six.moves import urllib + origin = ( + 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' + ) + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + # Load the dataset + with gzip.open(dataset, 'rb') as f: + try: + train_set, valid_set, test_set = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set = pickle.load(f) + # train_set, valid_set, test_set format: tuple(input, target) + # input is a numpy.ndarray of 2 dimensions (a matrix), np.float32 + # where each row corresponds to an example. target is a + # numpy.ndarray of 1 dimension (vector), np.int64 that has the same length + # as the number of rows in the input. It should give the target + # to the example with the same index in the input. + return train_set, valid_set, test_set + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + +########################################################################### +# adult +########################################################################### + + +def maybe_download(train_data, test_data): + import pandas as pd + """if adult data "train.csv" and "test.csv" are not in your directory, + download them. + """ + + COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", + "marital_status", "occupation", "relationship", "race", "gender", + "capital_gain", "capital_loss", "hours_per_week", "native_country", + "income_bracket"] + + if not os.path.exists(train_data): + print("downloading training data...") + df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + names=COLUMNS, skipinitialspace=True) + else: + df_train = pd.read_csv("train.csv") + + if not os.path.exists(test_data): + print("downloading testing data...") + df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", + names=COLUMNS, skipinitialspace=True, skiprows=1) + else: + df_test = pd.read_csv("test.csv") + + return df_train, df_test + + +def cross_columns(x_cols): + """simple helper to build the crossed columns in a pandas dataframe + """ + crossed_columns = dict() + colnames = ['_'.join(x_c) for x_c in x_cols] + for cname, x_c in zip(colnames, x_cols): + crossed_columns[cname] = x_c + return crossed_columns + + +def val2idx(df, cols): + """helper to index categorical columns before embeddings. 
+ """ + val_types = dict() + for c in cols: + val_types[c] = df[c].unique() + + val_to_idx = dict() + for k, v in val_types.items(): + val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} + + for k, v in val_to_idx.items(): + df[k] = df[k].apply(lambda x: v[x]) + + unique_vals = dict() + for c in cols: + unique_vals[c] = df[c].nunique() + + return df, unique_vals + + +def onehot(x): + from sklearn.preprocessing import OneHotEncoder + return np.array(OneHotEncoder().fit_transform(x).todense()) + + +def wide(df_train, df_test, wide_cols, x_cols, target): + import pandas as pd + print('Processing wide data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_wide = pd.concat([df_train, df_test]) + + crossed_columns_d = cross_columns(x_cols) + categorical_columns = list( + df_wide.select_dtypes(include=['object']).columns) + + wide_cols += list(crossed_columns_d.keys()) + + for k, v in crossed_columns_d.items(): + df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) + + df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] + + dummy_cols = [ + c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] + df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) + + train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + assert all(train.columns == test.columns) + + cols = [c for c in train.columns if c != target] + X_train = train[cols].values + y_train = train[target].values.reshape(-1, 1) + X_test = test[cols].values + y_test = test[target].values.reshape(-1, 1) + return X_train, y_train, X_test, y_test + + +def load_adult_data(return_val=True): + import pandas as pd + df_train, df_test = maybe_download("train.csv", "test.csv") + + df_train['income_label'] = ( + df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + df_test['income_label'] = ( + df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + + age_groups = [0, 25, 65, 90] + age_labels = range(len(age_groups) - 1) + df_train['age_group'] = pd.cut( + df_train['age'], age_groups, labels=age_labels) + df_test['age_group'] = pd.cut( + df_test['age'], age_groups, labels=age_labels) + + # columns for wide model + wide_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country', 'age_group'] + x_cols = (['education', 'occupation'], ['native_country', 'occupation']) + + # columns for deep model + embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country'] + cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] + + target = 'income_label' + + x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( + df_train, df_test, wide_cols, x_cols, target) + x_train_wide = np.array(x_train_wide).astype(np.float32) + x_test_wide = np.array(x_test_wide).astype(np.float32) + + print('Processing deep data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_deep = pd.concat([df_train, df_test]) + + deep_cols = embedding_cols + cont_cols + df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + df_deep[cont_cols] = pd.DataFrame(scaler.fit_transform(df_train[cont_cols]), + columns=cont_cols) + df_deep, unique_vals = val2idx(df_deep, embedding_cols) + + train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', 
axis=1) + + x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) + y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) + x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) + y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) + + x_train_deep = np.transpose(x_train_deep) + x_test_deep = np.transpose(x_test_deep) + y_train = onehot(y_train) + y_test = onehot(y_test) + + if return_val: + return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test + else: + return x_train_deep, x_train_wide, y_train diff --git a/examples/runner/models/wdl_adult.py b/examples/runner/models/wdl_adult.py new file mode 100644 index 0000000..6cef28c --- /dev/null +++ b/examples/runner/models/wdl_adult.py @@ -0,0 +1,57 @@ +import hetu as ht +from hetu import init + + +def wdl_adult(X_deep, X_wide, y_, dense_param_ctx): + lr = 5 / 128 + dim_wide = 809 + dim_deep = 68 + + with ht.context(dense_param_ctx): + W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W") + W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1") + b1 = init.random_normal([50], stddev=0.1, name="b1") + W2 = init.random_normal([50, 20], stddev=0.1, name="W2") + b2 = init.random_normal([20], stddev=0.1, name="b2") + + # deep + Embedding = [] + X_deep_input = None + + for i in range(8): + Embedding_name = "Embedding_deep_" + str(i) + Embedding.append(init.random_normal( + [50, 8], stddev=0.1, name=Embedding_name)) + now = ht.embedding_lookup_op(Embedding[i], X_deep[i]) + now = ht.array_reshape_op(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + for i in range(4): + now = ht.array_reshape_op(X_deep[i + 8], (-1, 1)) + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + mat1 = ht.matmul_op(X_deep_input, W1) + add1 = mat1 + ht.broadcastto_op(b1, mat1) + relu1 = ht.relu_op(add1) + dropout1 = relu1 + mat2 = ht.matmul_op(dropout1, W2) + add2 = mat2 + ht.broadcastto_op(b2, mat2) + relu2 = ht.relu_op(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = ht.concat_op(X_wide, dmodel, 1) + wmodel = ht.matmul_op(wmodel, W) + + prediction = wmodel + loss = ht.softmaxcrossentropy_op(prediction, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=lr) + train_op = opt.minimize(loss) + + return loss, prediction, y_, train_op diff --git a/examples/runner/parallel/README.md b/examples/runner/parallel/README.md new file mode 100644 index 0000000..f5fc8d9 --- /dev/null +++ b/examples/runner/parallel/README.md @@ -0,0 +1,35 @@ +## Usage +* Complex Pipeline Parallel (not using heturun): +```bash +mpirun --allow-run-as-root --tag-output -np 8 python complex_pipeline_mlp.py +``` + +* Simple Pipeline Parallel: +```bash +heturun -c config8.yml python simple_pipeline_mlp.py +``` + +* Data + Pipeline Parallel: +```bash +heturun -c config8.yml python data_pipeline_mlp.py +``` + +* Multiple Machine Data + Pipeline Parallel: +```bash +heturun -c dist_config8.yml python dist_data_pipeline_mlp.py +``` + +* Test Model Parallel (the following commands should give the same results): +```bash +heturun -c config3.yml python test_model_mlp_base.py --save +heturun -c config4.yml python test_model_mlp.py --split left +heturun -c config4.yml python test_model_mlp.py --split right +heturun -c config4.yml python test_model_mlp.py --split middle +``` + +* Data + Model (+ Pipeline) Parallel: +```bash +heturun -c config8.yml python data_model_pipeline_mlp.py 
--split left +heturun -c config8.yml python data_model_pipeline_mlp.py --split right +heturun -c config8.yml python data_model_pipeline_mlp.py --split middle +``` diff --git a/examples/runner/parallel/complex_pipeline_mlp.py b/examples/runner/parallel/complex_pipeline_mlp.py new file mode 100644 index 0000000..a9a3d25 --- /dev/null +++ b/examples/runner/parallel/complex_pipeline_mlp.py @@ -0,0 +1,200 @@ +import hetu as ht +from hetu import stream +from hetu import init + +import os +import sys +import json +import time +import argparse +import numpy as np +import logging + +np.random.seed(123) + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight = init.random_normal( + shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx) + bias = init.random_normal( + shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + args = parser.parse_args() + + # init and opt for both ranks + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + print("mpi_nccl init for gpu device: {}".format(device_id)) + executor_ctx = ht.gpu(device_id) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + + # init logger + logger = logging.getLogger() + ch = logging.StreamHandler() + formatter = logging.Formatter('[rank{}, PID{}]'.format( + device_id, os.getpid()) + ' %(asctime)s: %(message)s') + ch.setLevel(logging.DEBUG) + ch.setFormatter(formatter) + logger.addHandler(ch) + log = logger.warning + + # nccl communicate stream for pipeline_send/receive + communicate_stream = stream.create_stream_handle(executor_ctx) + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + x = ht.Variable(name="dataloader_x", trainable=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + + # model parallel + if comm.myRank.value == 0: + # rank0 + + # forward + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True, + ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (1024, 2048), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc3', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation_send_op = ht.pipeline_send_op( + activation, 1, comm, stream=communicate_stream) + + # backward + gradient_receive_op = ht.pipeline_receive_op( + 1, comm, ctx=executor_ctx, stream=communicate_stream) + required_vars = opt.get_var_list(activation) + 
opt.params = required_vars + grads = ht.gradients(activation, required_vars, + insert_grad=gradient_receive_op) + train_op = ht.optim.OptimizerOp(grads, opt) + + executor = ht.Executor( + [activation_send_op, train_op], ctx=executor_ctx) + + elif comm.myRank.value != 7: + # from rank1 to rank6 + previous_rank = comm.myRank.value - 1 + next_rank = comm.myRank.value + 1 + + # 1. receive activation from previous rank + activation_receive_op = ht.pipeline_receive_op( + previous_rank, comm, ctx=executor_ctx, stream=communicate_stream) + # forward + activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 2048), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc3', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + + # 2. send activation to next rank + activation_send_op = ht.pipeline_send_op( + activation, next_rank, comm, ctx=executor_ctx, stream=communicate_stream) + + # 3. receive gradients from next rank + gradient_receive_op = ht.pipeline_receive_op( + next_rank, comm, ctx=executor_ctx, stream=communicate_stream) + # backward + required_vars = opt.get_var_list(activation) + opt.params = required_vars + required_vars = [activation_receive_op] + required_vars + grads = ht.gradients(activation, required_vars, + insert_grad=gradient_receive_op) + train_op = ht.optim.OptimizerOp(grads[1:], opt) + + # 4. send gradients to previous rank + sendback_grad_op = ht.pipeline_send_op( + grads[0], previous_rank, comm, stream=communicate_stream) + + executor = ht.Executor( + [activation_send_op, sendback_grad_op, train_op], ctx=executor_ctx) + + else: + # rank7 + activation_receive_op = ht.pipeline_receive_op( + 6, comm, ctx=executor_ctx, stream=communicate_stream) + + # forward + activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + + # backward + required_vars = opt.get_var_list(loss) + opt.params = required_vars + required_vars = [activation_receive_op] + required_vars + grads = ht.gradients(loss, required_vars) + train_op = ht.optim.OptimizerOp(grads[1:], opt) + + sendback_grad_op = ht.pipeline_send_op( + grads[0], 6, comm, stream=communicate_stream) + executor = ht.Executor( + [loss, sendback_grad_op, train_op], ctx=executor_ctx) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + if comm.myRank.value == 0: + log("step {}:".format(step)) + if comm.myRank.value == 0: + executor.run(feed_dict={x: value_x_list[step % batch_num]}) + log("gpu0 ok") + elif comm.myRank.value == 7: + loss, _, _ = executor.run( + feed_dict={y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + log("gpu7 ok, loss: {}".format(loss[0])) + else: + executor.run() + log("gpu{} ok".format(comm.myRank.value)) + + # comm.stream.sync() + if communicate_stream: + communicate_stream.sync() + + end = time.time() + log("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/config3.yml b/examples/runner/parallel/config3.yml new file mode 100644 index 0000000..7c2ad6f --- /dev/null +++ b/examples/runner/parallel/config3.yml @@ -0,0 
+1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 3 + chief: true diff --git a/examples/runner/parallel/config4.yml b/examples/runner/parallel/config4.yml new file mode 100644 index 0000000..5028d3f --- /dev/null +++ b/examples/runner/parallel/config4.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 4 + chief: true diff --git a/examples/runner/parallel/config8.yml b/examples/runner/parallel/config8.yml new file mode 100644 index 0000000..2676c7d --- /dev/null +++ b/examples/runner/parallel/config8.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 8 + chief: true diff --git a/examples/runner/parallel/data_model_pipeline_mlp.py b/examples/runner/parallel/data_model_pipeline_mlp.py new file mode 100644 index 0000000..ff6539c --- /dev/null +++ b/examples/runner/parallel/data_model_pipeline_mlp.py @@ -0,0 +1,97 @@ +import hetu as ht + +import time +import argparse + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight = ht.init.random_normal( + shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx) + bias = ht.init.random_normal( + shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--split', type=str, default='left', + help='left, middle, right') + args = parser.parse_args() + assert args.split in ('left', 'middle', 'right') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context([ht.gpu(0), ht.gpu(4)]): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 2048), 'mlp_fc2', with_relu=True) + activation = fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True) + if args.split == 'left': + activation = ht.dispatch(activation, (2, 1)) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 1), duplicate=2) + elif args.split == 'right': + activation = ht.dispatch(activation, (1, 1), duplicate=2) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 2)) + else: + activation = ht.dispatch(activation, (1, 2)) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (2, 1)) + + with ht.context([(ht.gpu(1), ht.gpu(2)), (ht.gpu(5), ht.gpu(6))]): + activation = ht.matmul_op(activation, weight) + activation = ht.dispatch(activation, (1, 1)) + + with ht.context([ht.gpu(3), ht.gpu(7)]): + activation = ht.relu_op(activation) + activation = fc(activation, (2048, 2048), 'mlp_fc2', with_relu=True) + activation = 
fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True) + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 3: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 3: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/data_pipeline_mlp.py b/examples/runner/parallel/data_pipeline_mlp.py new file mode 100644 index 0000000..076a5bd --- /dev/null +++ b/examples/runner/parallel/data_pipeline_mlp.py @@ -0,0 +1,66 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + with ht.context("gpu:0,gpu:4"): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True) + + with ht.context("gpu:1,gpu:5"): + activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True) + + with ht.context("gpu:2,gpu:6"): + activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True) + + with ht.context("gpu:3,gpu:7"): + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + executor = ht.Executor([loss, train_op]) + + print_devices = [3, 7] + + # training + steps = train_set_x.shape[0] // args.batch_size + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + if executor.local_rank in print_devices: + print('[step {}]: loss: {}'.format(step, loss_val[0])) diff --git a/examples/runner/parallel/dist_config8.yml b/examples/runner/parallel/dist_config8.yml new file mode 100644 index 0000000..4129fb9 --- /dev/null +++ b/examples/runner/parallel/dist_config8.yml @@ -0,0 +1,8 
@@ +nodes: + - host: daim118 + servers: 0 + workers: 4 + chief: true + - host: daim117 + servers: 0 + workers: 4 \ No newline at end of file diff --git a/examples/runner/parallel/dist_data_pipeline_mlp.py b/examples/runner/parallel/dist_data_pipeline_mlp.py new file mode 100644 index 0000000..e05d228 --- /dev/null +++ b/examples/runner/parallel/dist_data_pipeline_mlp.py @@ -0,0 +1,68 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np +import socket + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + with ht.context([ht.rgpu('daim117', 0), ht.rgpu('daim117', 1)]): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True) + + with ht.context([ht.rgpu('daim117', 2), ht.rgpu('daim117', 3)]): + activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True) + + with ht.context([ht.rgpu('daim118', 0), ht.rgpu('daim118', 1)]): + activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True) + + with ht.context([ht.rgpu('daim118', 2), ht.rgpu('daim118', 3)]): + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + executor = ht.Executor([loss, train_op]) + + print_ranks = [2, 3] + hostname = socket.gethostname() + + # training + steps = train_set_x.shape[0] // args.batch_size + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + if executor.local_rank in print_ranks and hostname == 'daim118': + print('[step {}]: loss: {}'.format(step, loss_val[0])) diff --git a/examples/runner/parallel/simple_pipeline_mlp.py b/examples/runner/parallel/simple_pipeline_mlp.py new file mode 100644 index 0000000..d200364 --- /dev/null +++ b/examples/runner/parallel/simple_pipeline_mlp.py @@ -0,0 +1,76 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = 
argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=8, + help='training epochs') + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + # pipeline parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + + for i in range(1, 7): + with ht.context(ht.gpu(i)): + activation = fc(activation, (1024, 1024), 'mlp_fc%d' % + (i + 1), with_relu=True) + + with ht.context(ht.gpu(7)): + y_pred = fc(activation, (1024, 10), 'mlp_fc8', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + steps = train_set_x.shape[0] // args.batch_size + for epoch in range(args.epochs): + loss_vals = [] + if epoch == args.warmup: + start_time = time.time() + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + loss_vals.append(loss_val) + if executor.rank == 7: + print('epoch: {}, loss: {}'.format(epoch, np.mean(loss_vals))) + + if executor.rank == 0: + end_time = time.time() + print("time elapsed for {} epochs: {}s".format( + args.epochs-args.warmup, round(end_time-start_time, 3))) diff --git a/examples/runner/parallel/test_model_mlp.py b/examples/runner/parallel/test_model_mlp.py new file mode 100644 index 0000000..2eeab1a --- /dev/null +++ b/examples/runner/parallel/test_model_mlp.py @@ -0,0 +1,92 @@ +import hetu as ht + +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight_save = np.load('std/' + name + '_weight.npy') + bias_save = np.load('std/' + name + '_bias.npy') + weight = ht.Variable(value=weight_save, name=name+'_weight', ctx=ctx) + bias = ht.Variable(value=bias_save, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--split', type=str, default='left') + args = parser.parse_args() + assert args.split in ('left', 'right', 'middle') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + 
value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + weight_save = np.load('std/' + 'special_weight.npy') + weight = ht.Variable(value=weight_save, name='mlp_fc1_weight') + if args.split == 'left': + activation = ht.dispatch(activation, (2, 1)) + weight = ht.dispatch(weight, (1, 1), duplicate=2) + elif args.split == 'right': + activation = ht.dispatch(activation, (1, 1), duplicate=2) + weight = ht.dispatch(weight, (1, 2)) + else: + activation = ht.dispatch(activation, (1, 2)) + weight = ht.dispatch(weight, (2, 1)) + + with ht.context((ht.gpu(1), ht.gpu(2))): + activation = ht.matmul_op(activation, weight) + activation = ht.dispatch(activation, (1, 1)) + + with ht.context(ht.gpu(3)): + activation = ht.relu_op(activation) + y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 3: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 3: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/test_model_mlp_base.py b/examples/runner/parallel/test_model_mlp_base.py new file mode 100644 index 0000000..2941e96 --- /dev/null +++ b/examples/runner/parallel/test_model_mlp_base.py @@ -0,0 +1,94 @@ +import hetu as ht + +import time +import argparse +import os +import numpy as np + + +def fc(x, shape, name, with_relu=True, rank=-1): + weight_save = np.random.normal(0, 0.04, size=shape) + bias_save = np.random.normal(0, 0.04, size=shape[-1:]) + weight = ht.Variable(value=weight_save, name=name+'_weight') + bias = ht.Variable(value=bias_save, name=name+'_bias') + global args + if args.save and args.rank == rank: + np.save('std/' + name + '_weight.npy', weight_save) + np.save('std/' + name + '_bias.npy', bias_save) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--save', action='store_true') + global args + args = parser.parse_args() + if args.save: + comm = ht.wrapped_mpi_nccl_init() + args.rank = comm.rank + if args.rank == 0 and not os.path.exists('std'): + os.mkdir('std') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i 
in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True, rank=0) + + with ht.context(ht.gpu(1)): + weight_save = np.random.normal(0, 0.04, size=(1024, 2048)) + if args.save and args.rank == 1: + np.save('std/' + 'special_weight.npy', weight_save) + weight = ht.Variable(value=weight_save, name='mlp_fc1_weight') + activation = ht.matmul_op(activation, weight) + + with ht.context(ht.gpu(2)): + activation = ht.relu_op(activation) + y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False, rank=2) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 2: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 2: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/remote_allreduce.yml b/examples/runner/remote_allreduce.yml new file mode 100644 index 0000000..7da1df7 --- /dev/null +++ b/examples/runner/remote_allreduce.yml @@ -0,0 +1,6 @@ +nodes: + - host: daim118 + workers: 4 + chief: true + - host: daim117 + workers: 2 diff --git a/examples/runner/remote_ps.yml b/examples/runner/remote_ps.yml new file mode 100644 index 0000000..fee7c84 --- /dev/null +++ b/examples/runner/remote_ps.yml @@ -0,0 +1,8 @@ +nodes: + - host: daim118 + servers: 1 + workers: 4 + chief: true + - host: daim117 + servers: 1 + workers: 2 diff --git a/examples/runner/run_mlp.py b/examples/runner/run_mlp.py new file mode 100644 index 0000000..e773dcb --- /dev/null +++ b/examples/runner/run_mlp.py @@ -0,0 +1,118 @@ +import hetu as ht +from models import MLP + +import os +import numpy as np +import argparse +import json +from time import time + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='local', + help='[local, lps(localps), lar(localallreduce), rps(remoteps), rar]') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=10, help='epoch number') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + dataset = 'MNIST' + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
+ if args.opt == 'sgd': + print('Use SGD Optimizer.') + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print('Use Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'nesterov': + print('Use Nesterov Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer( + learning_rate=args.learning_rate, nesterov=True) + elif args.opt == 'adagrad': + print('Use AdaGrad Optimizer.') + opt = ht.optim.AdaGradOptimizer( + learning_rate=args.learning_rate, initial_accumulator_value=0.1) + else: + print('Use Adam Optimizer.') + opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate) + + # data loading + print('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + # x_shape = (args.batch_size, 784) + # y_shape = (args.batch_size, 10) + + # model definition + ctx = { + 'local': ht.gpu(0), + 'lps': [ht.cpu(0), ht.gpu(0), ht.gpu(1), ht.gpu(4), ht.gpu(5)], + 'lar': [ht.gpu(1), ht.gpu(2), ht.gpu(3), ht.gpu(6)], + 'rps': ['cpu:0', 'daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3'], + 'rar': ['daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3'] + }[args.config] + with ht.context(ctx): + print('Building model...') + x = ht.dataloader_op([ + ht.Dataloader(train_set_x, args.batch_size, 'train'), + ht.Dataloader(valid_set_x, args.batch_size, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(train_set_y, args.batch_size, 'train'), + ht.Dataloader(valid_set_y, args.batch_size, 'validate'), + ]) + + loss, y = MLP.mlp(x, y_) + train_op = opt.minimize(loss) + + executor = ht.Executor( + {'train': [loss, y, train_op], 'validate': [loss, y, y_]}) + n_train_batches = executor.get_batch_num('train') + n_valid_batches = executor.get_batch_num('validate') + + # training + print("Start training loop...") + for i in range(args.num_epochs): + print("Epoch %d" % i) + loss_all = 0 + if args.timing: + start = time() + for minibatch_index in range(n_train_batches): + loss_val, predict_y, _ = executor.run('train') + loss_val = loss_val.asnumpy() + loss_all += loss_val * x.dataloaders['train'].last_batch_size + loss_all /= len(train_set_x) + print("Loss = %f" % loss_all) + if args.timing: + end = time() + print("Time = %f" % (end - start)) + + if args.validate: + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + loss_val, valid_y_predicted, y_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + accuracy = np.mean(correct_predictions) + print("Validation accuracy = %f" % accuracy) diff --git a/examples/runner/run_wdl.py b/examples/runner/run_wdl.py new file mode 100644 index 0000000..e4163c7 --- /dev/null +++ b/examples/runner/run_wdl.py @@ -0,0 +1,130 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import numpy as np +import yaml +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +from models import load_data, wdl_adult + + +def worker(args): + def train(iterations, auc_enabled=True, tqdm_enabled=False): + localiter 
= tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + train_loss = [] + train_acc = [] + if auc_enabled: + train_auc = [] + for it in localiter: + loss_val, predict_y, y_val, _ = executor.run( + 'train', convert_to_numpy_ret_vals=True) + acc_val = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + if auc_enabled: + train_auc.append(metrics.roc_auc_score(y_val, predict_y)) + if auc_enabled: + return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc) + else: + return np.mean(train_loss), np.mean(train_acc) + + def validate(iterations, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + test_loss = [] + test_acc = [] + test_auc = [] + for it in localiter: + loss_val, test_y_predicted, y_test_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + correct_prediction = np.equal( + np.argmax(y_test_val, 1), + np.argmax(test_y_predicted, 1)).astype(np.float32) + test_loss.append(loss_val[0]) + test_acc.append(correct_prediction) + test_auc.append(metrics.roc_auc_score( + y_test_val, test_y_predicted)) + return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc) + + batch_size = 128 + + ctx = { + 'local': 'gpu:0', + 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7', + 'lhy': 'cpu:0,gpu:1,gpu:2,gpu:3,gpu:6', + 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3', + 'rhy': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3' + }[args.config] + dense_param_ctx = {'local': 'gpu:0', 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7', 'lhy': 'gpu:1,gpu:2,gpu:3,gpu:6', + 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3', + 'rhy': 'daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3'}[args.config] + with ht.context(ctx): + x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_data.load_adult_data() + dense_input = [ + ht.dataloader_op([ + [x_train_deep[:, i], batch_size, 'train'], + [x_test_deep[:, i], batch_size, 'validate'], + ]) for i in range(12) + ] + sparse_input = ht.dataloader_op([ + [x_train_wide, batch_size, 'train'], + [x_test_wide, batch_size, 'validate'], + ]) + y_ = ht.dataloader_op([ + [y_train, batch_size, 'train'], + [y_test, batch_size, 'validate'], + ]) + print("Data loaded.") + + loss, prediction, y_, train_op = wdl_adult.wdl_adult( + dense_input, sparse_input, y_, dense_param_ctx) + + eval_nodes = {'train': [loss, prediction, y_, train_op]} + if args.val: + print('Validation enabled...') + eval_nodes['validate'] = [loss, prediction, y_] + executor = ht.Executor(eval_nodes, + cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123) + + total_epoch = args.nepoch if args.nepoch > 0 else 50 + for ep in range(total_epoch): + if ep == 5: + start = time.time() + print("epoch %d" % ep) + ep_st = time.time() + train_loss, train_acc = train( + executor.get_batch_num('train'), auc_enabled=False) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" + % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc)) + else: + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (train_loss, train_acc, ep_en - ep_st)) + print('all time:', time.time() - 
start) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='local', + help='[local, lps(localps), lhy(localhybrid), rps(remoteps), rhy]') + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument("--nepoch", type=int, default=-1, + help="num of epochs, each train 1/10 data") + args = parser.parse_args() + worker(args) diff --git a/hetu.exp b/hetu.exp new file mode 100755 index 0000000..1528483 --- /dev/null +++ b/hetu.exp @@ -0,0 +1,5 @@ +#!/bin/bash +path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +echo "Hetu root is" $path +export PATH="$path/bin:$PATH" +export PYTHONPATH="$path/python:$path/build/lib:$path/third_party/GraphMix/python:$PYTHONPATH:$path/third_party/HetuML/hetuml/python" diff --git a/img/alibabacloud.png b/img/alibabacloud.png new file mode 100644 index 0000000..2b0095e Binary files /dev/null and b/img/alibabacloud.png differ diff --git a/img/hetu.png b/img/hetu.png new file mode 100644 index 0000000..5791951 Binary files /dev/null and b/img/hetu.png differ diff --git a/img/kuaishou.png b/img/kuaishou.png new file mode 100644 index 0000000..acce838 Binary files /dev/null and b/img/kuaishou.png differ diff --git a/img/tencent.png b/img/tencent.png new file mode 100644 index 0000000..26fdb92 Binary files /dev/null and b/img/tencent.png differ diff --git a/ps-lite/.gitignore b/ps-lite/.gitignore new file mode 100644 index 0000000..ba96dc2 --- /dev/null +++ b/ps-lite/.gitignore @@ -0,0 +1,2 @@ +src/meta.pb.cc +src/meta.pb.h diff --git a/ps-lite/CMakeLists.txt b/ps-lite/CMakeLists.txt new file mode 100644 index 0000000..4d6fce6 --- /dev/null +++ b/ps-lite/CMakeLists.txt @@ -0,0 +1,41 @@ +# port from https://github.com/dmlc/ps-lite/blob/master/CMakeLists.txt + +aux_source_directory(src PS_SRC) +add_library(ps SHARED ${PS_SRC}) +target_include_directories(ps PUBLIC include) + +# find and build zeroMQ +find_package(ZMQ 4.3.2) +if(NOT ZMQ_FOUND) + message(STATUS "ZMQ not Found, Preparing ZMQ ...") + FetchContent_Declare(zmq URL https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip) + FetchContent_GetProperties(zmq) + if(NOT zmq_POPULATED) + FetchContent_Populate(zmq) + if(POLICY CMP0077) # Avoid building shared library and tests on CMake 3.13+ + cmake_policy(SET CMP0077 NEW) + set(BUILD_SHARED OFF CACHE BOOL "") + set(BUILD_TESTS OFF CACHE BOOL "") + endif() + add_subdirectory(${zmq_SOURCE_DIR} ${zmq_BINARY_DIR}) + endif() + target_link_libraries(ps PRIVATE libzmq-static) +else() + target_include_directories(ps PRIVATE ${ZMQ_INCLUDE_DIRS}) + target_link_libraries(ps PRIVATE ${ZMQ_LIBRARIES}) +endif() + +# find and build protobuf +find_package(Protobuf) +if(NOT Protobuf_FOUND) + message(FATAL_ERROR "Protobuf not found, use `conda install protobuf`") +endif() + +set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) +file(GLOB_RECURSE PROTO_FILES "proto/*.proto") +protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) +target_sources(ps PRIVATE ${PROTO_SRCS}) +target_include_directories(ps PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # PROTO_HDRS files are here +target_include_directories(ps PRIVATE ${PROTOBUF_INCLUDE_DIR}) 
+target_include_directories(ps PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_link_libraries(ps PRIVATE ${PROTOBUF_LIBRARY})
diff --git a/ps-lite/README.md b/ps-lite/README.md
new file mode 100644
index 0000000..4587c0f
--- /dev/null
+++ b/ps-lite/README.md
@@ -0,0 +1,83 @@
+# PS-lite Module [adapted from https://github.com/dmlc/ps-lite]
+
+---
+
+## Usage Guide
+
+The PS-lite module is a lightweight C++ parameter server with a ctypes Python interface. It provides a list of PS functions that are useful in distributed training.
+
+To use PS, we need three roles: worker, server and scheduler. Workers are the training processes. Servers are where the parameters are stored. The scheduler sets up and tears down the connections. There can be multiple servers and workers, but only one scheduler.
+
+Currently, we only provide a Python interface for Hetu. Since the module depends on symbols from Hetu, the PS functions can only be used after `import hetu`. Here is a quick example of how to use ps-lite with Hetu.
+
+```python
+# worker.py
+import hetu
+import numpy as np
+import ctypes
+# create arrays
+tgt_array = hetu.ndarray.empty([128])
+name = 0  # a number that identifies a parameter; should be the same among all workers
+param_type = 0  # 0 for dense parameter
+# PS initialize
+hetu.worker_init()
+# PS functions here
+comm = hetu.get_worker_communicate()
+# InitTensor(node_name, param_type, length, width, init_type, init_param_a, init_param_b, seed, opt_type, opt_args, num_opt_args)
+# This function is synchronous.
+comm.InitTensor(name, param_type, 128, 1, 0, 5.0, 1.0, 123, 0, (ctypes.c_float * 1)(0.1), 1)
+comm.Pull(name, tgt_array.handle)
+comm.Wait(name)
+print(tgt_array.asnumpy())
+# PS finalize
+hetu.worker_finish()
+```
+We also need server and scheduler scripts:
+```python
+# server.py
+import hetu
+hetu.server_init()
+hetu.server_finish()
+```
+
+```python
+# scheduler.py
+import hetu
+hetu.scheduler_init()
+hetu.scheduler_finish()
+```
+
+To run the scripts, we use environment variables to specify which IP address and port each role uses. It is recommended to store these environment variables in a yaml or json file (see the Python launcher sketch at the end of this README).
+
+```shell
+export DMLC_PS_ROOT_URI=127.0.0.1 DMLC_PS_ROOT_PORT=4080 DMLC_NUM_WORKER=1 DMLC_NUM_SERVER=1 DMLC_PS_VAN_TYPE=p3
+DMLC_ROLE=scheduler python3 scheduler.py &
+DMLC_ROLE=server SERVER_ID=0 DMLC_PS_SERVER_URI=127.0.0.1 DMLC_PS_SERVER_PORT=4081 python3 server.py &
+DMLC_ROLE=worker WORKER_ID=0 DMLC_PS_WORKER_URI=127.0.0.1 DMLC_PS_WORKER_PORT=4082 python3 worker.py
+```
+
+## PS functions
+
+We provide a list of useful parameter server functions for training.
+
+The module can also be easily extended with new PS functions. There are several steps to follow.
+
+1. Create an enum value in psf/PSFunc.h and write a struct to define the PS function.
+
+   ```C++
+   template<> struct PSFData<NewPSFType> {  // NewPSFType: the enum value created above
+     using Request = tuple<
+       unsigned long,
+       SArray<float>
+     >;
+     using Response = tuple<>;
+     static void _callback(const Response &response) {/* callback here */}
+   };
+   ```
+
+   Here we can use scalar types like int, float, ... or arrays as function parameters. Note that arrays are shared while scalars are copied.
+
+2. Implement the server handler in server/PSFHandler.h.
+
+3. Use `kvworker.Request` to launch your PS function and `kvworker.Wait` to wait until the callback finishes; see more examples in PSAgent.h. We can also write a Python binding to expose the PS function to the Python layer.
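+
+As a complement to the shell commands above, the following is a minimal local-launch sketch. It is not part of the module itself: the file name `ps_env.json` and the use of `subprocess` are illustrative assumptions; it simply stores the shared DMLC_* variables in a json file, as recommended above, and spawns the three role scripts shown earlier.
+
+```python
+# launch_local_ps.py -- rough sketch; assumes worker.py / server.py /
+# scheduler.py from the examples above are in the current directory.
+import json
+import os
+import subprocess
+
+# store the shared DMLC_* variables in a json file, as recommended above
+with open("ps_env.json", "w") as f:
+    json.dump({"DMLC_PS_ROOT_URI": "127.0.0.1", "DMLC_PS_ROOT_PORT": "4080",
+               "DMLC_NUM_WORKER": "1", "DMLC_NUM_SERVER": "1",
+               "DMLC_PS_VAN_TYPE": "p3"}, f)
+
+with open("ps_env.json") as f:
+    shared = json.load(f)
+
+def spawn(script, role_env):
+    # each role inherits the shared variables plus its role-specific ones
+    env = dict(os.environ)
+    env.update(shared)
+    env.update(role_env)
+    return subprocess.Popen(["python3", script], env=env)
+
+procs = [
+    spawn("scheduler.py", {"DMLC_ROLE": "scheduler"}),
+    spawn("server.py", {"DMLC_ROLE": "server", "SERVER_ID": "0",
+                        "DMLC_PS_SERVER_URI": "127.0.0.1",
+                        "DMLC_PS_SERVER_PORT": "4081"}),
+    spawn("worker.py", {"DMLC_ROLE": "worker", "WORKER_ID": "0",
+                        "DMLC_PS_WORKER_URI": "127.0.0.1",
+                        "DMLC_PS_WORKER_PORT": "4082"}),
+]
+for p in procs:
+    p.wait()
+```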
+ diff --git a/ps-lite/include/common/dmlc_base.h b/ps-lite/include/common/dmlc_base.h new file mode 100644 index 0000000..f33ba10 --- /dev/null +++ b/ps-lite/include/common/dmlc_base.h @@ -0,0 +1,195 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file base.h + * \brief defines configuration macros + */ +#ifndef DMLC_BASE_H_ +#define DMLC_BASE_H_ + +/*! \brief whether use glog for logging */ +#ifndef DMLC_USE_GLOG +#define DMLC_USE_GLOG 0 +#endif + +/*! + * \brief whether throw dmlc::Error instead of + * directly calling abort when FATAL error occured + * NOTE: this may still not be perfect. + * do not use FATAL and CHECK in destructors + */ +#ifndef DMLC_LOG_FATAL_THROW +#define DMLC_LOG_FATAL_THROW 1 +#endif + +/*! + * \brief Whether to print stack trace for fatal error, + * enabled on linux when using gcc. + */ +#if (defined(__GNUC__) && !defined(__MINGW32__) && !defined(__sun) \ + && !defined(__SVR4) && !(defined __MINGW64__) && !(defined __ANDROID__)) +#if (!defined(DMLC_LOG_STACK_TRACE)) +#define DMLC_LOG_STACK_TRACE 1 +#endif +#if (!defined(DMLC_LOG_STACK_TRACE_SIZE)) +#define DMLC_LOG_STACK_TRACE_SIZE 10 +#endif +#endif + +/*! \brief whether compile with hdfs support */ +#ifndef DMLC_USE_HDFS +#define DMLC_USE_HDFS 0 +#endif + +/*! \brief whether compile with s3 support */ +#ifndef DMLC_USE_S3 +#define DMLC_USE_S3 0 +#endif + +/*! \brief whether or not use parameter server */ +#ifndef DMLC_USE_PS +#define DMLC_USE_PS 0 +#endif + +/*! \brief whether or not use c++11 support */ +#ifndef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 \ + (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L \ + || defined(_MSC_VER)) +#endif + +/// check if g++ is before 4.6 +#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__) +#if __GNUC__ == 4 && __GNUC_MINOR__ < 6 +#pragma message("Will need g++-4.6 or higher to compile all" \ + "the features in dmlc-core, " \ + "compile without c++11, some features may be disabled") +#undef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 0 +#endif +#endif + +/*! + * \brief Disable copy constructor and assignment operator. + * + * If C++11 is supported, both copy and move constructors and + * assignment operators are deleted explicitly. Otherwise, they are + * only declared but not implemented. Place this macro in private + * section if C++11 is not available. + */ +#ifndef DISALLOW_COPY_AND_ASSIGN +#if DMLC_USE_CXX11 +#define DISALLOW_COPY_AND_ASSIGN(T) \ + T(T const &) = delete; \ + T(T &&) = delete; \ + T &operator=(T const &) = delete; \ + T &operator=(T &&) = delete +#else +#define DISALLOW_COPY_AND_ASSIGN(T) \ + T(T const &); \ + T &operator=(T const &) +#endif +#endif + +/// +/// code block to handle optionally loading +/// +#if !defined(__GNUC__) +#define fopen64 std::fopen +#endif +#ifdef _MSC_VER +#if _MSC_VER < 1900 +// NOTE: sprintf_s is not equivalent to snprintf, +// they are equivalent when success, which is sufficient for our case +#define snprintf sprintf_s +#define vsnprintf vsprintf_s +#endif +#else +#ifdef _FILE_OFFSET_BITS +#if _FILE_OFFSET_BITS == 32 +#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit") +#endif +#endif + +#ifdef __APPLE__ +#define off64_t off_t +#define fopen64 std::fopen +#endif + +extern "C" { +#include +} +#endif + +#ifdef _MSC_VER +//! 
\cond Doxygen_Suppress +typedef signed char int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned char uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +//! \endcond +#else +#include +#endif +#include +#include + +/*! \brief namespace for dmlc */ +namespace dmlc { +/*! + * \brief safely get the beginning address of a vector + * \param vec input vector + * \return beginning address of a vector + */ +template +inline T *BeginPtr(std::vector &vec) { // NOLINT(*) + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! + * \brief get the beginning address of a vector + * \param vec input vector + * \return beginning address of a vector + */ +template +inline const T *BeginPtr(const std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! + * \brief get the beginning address of a vector + * \param str input string + * \return beginning address of a string + */ +inline char *BeginPtr(std::string &str) { // NOLINT(*) + if (str.length() == 0) + return NULL; + return &str[0]; +} +/*! + * \brief get the beginning address of a vector + * \param str input string + * \return beginning address of a string + */ +inline const char *BeginPtr(const std::string &str) { + if (str.length() == 0) + return NULL; + return &str[0]; +} +} // namespace dmlc + +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define constexpr const +#define alignof __alignof +#endif + +#endif // DMLC_BASE_H_ diff --git a/ps-lite/include/common/logging.h b/ps-lite/include/common/logging.h new file mode 100644 index 0000000..0aa23bc --- /dev/null +++ b/ps-lite/include/common/logging.h @@ -0,0 +1,349 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file logging.h + * \brief defines logging macros of dmlc + * allows use of GLOG, fall back to internal + * implementation when disabled + */ +#ifndef DMLC_LOGGING_H_ +#define DMLC_LOGGING_H_ +#include +#include +#include +#include +#include +#include +#include "dmlc_base.h" + +#if DMLC_LOG_STACK_TRACE +#include +#endif + +#if DMLC_LOG_STACK_TRACE +#include +#endif + +namespace dmlc { +/*! + * \brief exception class that will be thrown by + * default logger if DMLC_LOG_FATAL_THROW == 1 + */ +struct Error : public std::runtime_error { + /*! + * \brief constructor + * \param s the error message + */ + explicit Error(const std::string &s) : std::runtime_error(s) { + } +}; +} // namespace dmlc + +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define noexcept(a) +#endif + +#if DMLC_USE_CXX11 +#define DMLC_THROW_EXCEPTION noexcept(false) +#else +#define DMLC_THROW_EXCEPTION +#endif + +#if DMLC_USE_GLOG +#include + +namespace dmlc { +inline void InitLogging(const char *argv0) { + google::InitGoogleLogging(argv0); +} +} // namespace dmlc + +#else +// use a light version of glog +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4722) +#endif + +namespace dmlc { +inline void InitLogging(const char *argv0) { + // DO NOTHING +} + +// Always-on checking +#define CHECK(x) \ + if (!(x)) \ + dmlc::LogMessageFatal(__FILE__, __LINE__).stream() << "Check " \ + "failed: " #x \ + << ' ' +#define CHECK_LT(x, y) CHECK((x) < (y)) +#define CHECK_GT(x, y) CHECK((x) > (y)) +#define CHECK_LE(x, y) CHECK((x) <= (y)) +#define CHECK_GE(x, y) CHECK((x) >= (y)) +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) +#define CHECK_NOTNULL(x) \ + ((x) == NULL ? 
dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << "Check notnull: " #x << ' ', \ + (x) : (x)) // NOLINT(*) +// Debug-only checking. +#ifdef NDEBUG +#define DCHECK(x) \ + while (false) \ + CHECK(x) +#define DCHECK_LT(x, y) \ + while (false) \ + CHECK((x) < (y)) +#define DCHECK_GT(x, y) \ + while (false) \ + CHECK((x) > (y)) +#define DCHECK_LE(x, y) \ + while (false) \ + CHECK((x) <= (y)) +#define DCHECK_GE(x, y) \ + while (false) \ + CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) \ + while (false) \ + CHECK((x) == (y)) +#define DCHECK_NE(x, y) \ + while (false) \ + CHECK((x) != (y)) +#else +#define DCHECK(x) CHECK(x) +#define DCHECK_LT(x, y) CHECK((x) < (y)) +#define DCHECK_GT(x, y) CHECK((x) > (y)) +#define DCHECK_LE(x, y) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) CHECK((x) == (y)) +#define DCHECK_NE(x, y) CHECK((x) != (y)) +#endif // NDEBUG + +#define LOG_INFO dmlc::LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LOG_INFO +#define LOG_WARNING LOG_INFO +#define LOG_FATAL dmlc::LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +// Poor man version of VLOG +#define VLOG(x) LOG_INFO.stream() + +#define LOG(severity) LOG_##severity.stream() +#define LG LOG_INFO.stream() +#define LF LOG_FATAL.stream() +#define LGMT dmlc::LogMessageMultiThread(__FILE__, __LINE__).stream() +#define LOG_IF(severity, condition) \ + !(condition) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) + +#ifdef NDEBUG +#define LOG_DFATAL LOG_ERROR +#define DFATAL ERROR +#define DLOG(severity) \ + true ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) +#define DLOG_IF(severity, condition) \ + (true || !(condition)) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) +#else +#define LOG_DFATAL LOG_FATAL +#define DFATAL FATAL +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#endif + +// Poor man version of LOG_EVERY_N +#define LOG_EVERY_N(severity, n) LOG(severity) + +class DateLogger { +public: + DateLogger() { +#if defined(_MSC_VER) + _tzset(); +#endif + } + const char *HumanDate() { +#if defined(_MSC_VER) + _strtime_s(buffer_, sizeof(buffer_)); +#else + time_t time_value = time(NULL); + struct tm now; + localtime_r(&time_value, &now); + snprintf(buffer_, sizeof(buffer_), "%02d:%02d:%02d", now.tm_hour, + now.tm_min, now.tm_sec); +#endif + return buffer_; + } + +private: + char buffer_[9]; +}; + +class LogMessage { +public: + LogMessage(const char *file, int line) : +#ifdef __ANDROID__ + log_stream_(std::cout) +#else + log_stream_(std::cerr) +#endif + { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + ~LogMessage() { + log_stream_ << "\n"; + } + std::ostream &stream() { + return log_stream_; + } + +protected: + std::ostream &log_stream_; + +private: + DateLogger pretty_date_; + LogMessage(const LogMessage &); + void operator=(const LogMessage &); +}; + +#if DMLC_LOG_STACK_TRACE +inline std::string Demangle(char const *msg_str) { + using std::string; + string msg(msg_str); + size_t symbol_start = string::npos; + size_t symbol_end = string::npos; + if (((symbol_start = msg.find("_Z")) != string::npos) + && (symbol_end = msg.find_first_of(" +", symbol_start))) { + string left_of_symbol(msg, 0, symbol_start); + string symbol(msg, symbol_start, symbol_end - symbol_start); + string right_of_symbol(msg, symbol_end); + + int status = 0; + size_t length = string::npos; + std::unique_ptr demangled_symbol = { + abi::__cxa_demangle(symbol.c_str(), 0, 
&length, &status), + &std::free}; + if (demangled_symbol && status == 0 && length > 0) { + string symbol_str(demangled_symbol.get()); + std::ostringstream os; + os << left_of_symbol << symbol_str << right_of_symbol; + return os.str(); + } + } + return string(msg_str); +} + +inline std::string StackTrace() { + using std::string; + std::ostringstream stacktrace_os; + const int MAX_STACK_SIZE = DMLC_LOG_STACK_TRACE_SIZE; + void *stack[MAX_STACK_SIZE]; + int nframes = backtrace(stack, MAX_STACK_SIZE); + stacktrace_os << "Stack trace returned " << nframes + << " entries:" << std::endl; + char **msgs = backtrace_symbols(stack, nframes); + if (msgs != nullptr) { + for (int frameno = 0; frameno < nframes; ++frameno) { + string msg = dmlc::Demangle(msgs[frameno]); + stacktrace_os << "[bt] (" << frameno << ") " << msg << "\n"; + } + } + free(msgs); + string stack_trace = stacktrace_os.str(); + return stack_trace; +} + +#else // DMLC_LOG_STACK_TRACE is off + +inline std::string demangle(char const *msg_str) { + return std::string(); +} + +inline std::string StackTrace() { + return std::string("stack traces not available when " + "DMLC_LOG_STACK_TRACE is disabled at compile time."); +} + +#endif // DMLC_LOG_STACK_TRACE + +#if DMLC_LOG_FATAL_THROW == 0 +class LogMessageFatal : public LogMessage { +public: + LogMessageFatal(const char *file, int line) : LogMessage(file, line) { + } + ~LogMessageFatal() { + log_stream_ << "\n"; + abort(); + } + +private: + LogMessageFatal(const LogMessageFatal &); + void operator=(const LogMessageFatal &); +}; +#else +class LogMessageFatal { +public: + LogMessageFatal(const char *file, int line) { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + std::ostringstream &stream() { + return log_stream_; + } + ~LogMessageFatal() DMLC_THROW_EXCEPTION { +#if DMLC_LOG_STACK_TRACE + log_stream_ << "\n\n" << StackTrace() << "\n"; +#endif + // throwing out of destructor is evil + // hopefully we can do it here + // also log the message before throw + LOG(ERROR) << log_stream_.str(); + throw Error(log_stream_.str()); + } + +private: + std::ostringstream log_stream_; + DateLogger pretty_date_; + LogMessageFatal(const LogMessageFatal &); + void operator=(const LogMessageFatal &); +}; +#endif + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". +class LogMessageVoidify { +public: + LogMessageVoidify() { + } + // This has to be an operator with a precedence lower than << but + // higher than "?:". See its usage. 
+ void operator&(std::ostream &) { + } +}; + +class LogMessageMultiThread { +public: + LogMessageMultiThread(const char *file, int line) { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + std::ostringstream &stream() { + return log_stream_; + } + ~LogMessageMultiThread() { + log_stream_ << "\n"; + } + +private: + std::ostringstream log_stream_; + DateLogger pretty_date_; + LogMessageMultiThread(const LogMessageMultiThread &); + void operator=(const LogMessageMultiThread &); +}; + +} // namespace dmlc + +#endif +#endif // DMLC_LOGGING_H_ diff --git a/ps-lite/include/common/sarray.h b/ps-lite/include/common/sarray.h new file mode 100644 index 0000000..71c7cea --- /dev/null +++ b/ps-lite/include/common/sarray.h @@ -0,0 +1,328 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "logging.h" + +/** + * \brief Shared array + * + * A smart array that retains shared ownership. It provides similar + * functionalities comparing to std::vector, including data(), size(), + * operator[], resize(), clear(). SArray can be easily constructed from + * std::vector, such as + * + * \code + * std::vector a(10); SArray b(a); // copying + * std::shared_ptr> c(new std::vector(10)); + * SArray d(c); // only pointer copying + * \endcode + * + * SArray is also like a C pointer when copying and assigning, namely + * both copy are assign are passing by pointers. The memory will be release only + * if there is no copy exists. It is also can be cast without memory copy, such + * as + * + * \code + * SArray a(10); + * SArray b(a); // now b.size() = 10 * sizeof(int); + * \endcode + * + * \tparam V the value type + */ +template +class SArray { +public: + /** \brief empty constructor */ + SArray() { + } + + /** \brief empty deconstrcutor */ + ~SArray() { + } + + /** + * \brief Create an array with length n with initialized value + * \param size the length + * \param val the initial length (0 in default) + */ + explicit SArray(size_t size, V val = 0) { + resize(size, val); + } + + /** + * \brief construct from another SArray. + * + * Zero-copy constructor, namely just copy the pointer + * + * \tparam W the value type of the source array + * \param arr the source array + */ + template + explicit SArray(const SArray &arr) { + *this = arr; + } + + /** + * \brief construct from another SArray. 
+ * + * Zero-copy constructor, namely just copy the pointer + * + * \tparam W the value type of the source array + * \param arr the source array + */ + template + void operator=(const SArray &arr) { + size_ = arr.size() * sizeof(W) / sizeof(V); + CHECK_EQ(size_ * sizeof(V), arr.size() * sizeof(W)) + << "cannot be divided"; + capacity_ = arr.capacity() * sizeof(W) / sizeof(V); + ptr_ = std::shared_ptr(arr.ptr(), reinterpret_cast(arr.data())); + } + + /** + * \brief construct from a c-array + * + * Zero-copy constructor, namely just copy the pointer + * + * \param data the source data + * \param size the length + * \param deletable whether or not can call `delete [] data` when the + * reference count goes 0 + */ + + SArray(V *data, size_t size, bool deletable = false) { + if (deletable) { + reset(data, size, [](V *data) { delete[] data; }); + } else { + reset(data, size, [](V *data) {}); + } + } + + /** + * \brief copy from a c-array + * + * \param data the source data + * \param size the length + */ + void CopyFrom(const V *data, size_t size) { + resize(size); + memcpy(this->data(), data, size * sizeof(V)); + } + + /** + * \brief copy from another SArray + * + * \param other the source data + */ + void CopyFrom(const SArray &other) { + if (this == &other) + return; + CopyFrom(other.data(), other.size()); + } + + /** + * \brief copy from an iterator + */ + template + void CopyFrom(const ForwardIt &first, const ForwardIt &last) { + size_t size = static_cast(std::distance(first, last)); + V *data = new V[size]; + reset(data, size, [](V *data) { delete[] data; }); + auto it = first; + while (size-- > 0) { + *data = *it; + ++data; + ++it; + } + } + + /** + * \brief construct from a std::vector, copy the data + */ + explicit SArray(const std::vector &vec) { + CopyFrom(vec.data(), vec.size()); + } + + /** + * \brief construct from a shared std::vector pinter, no data copy + */ + explicit SArray(const std::shared_ptr> &vec) { + ptr_ = std::shared_ptr(vec, vec->data()); + size_ = vec->size(); + capacity_ = size_; + } + + /** @brief Copy from a initializer_list */ + template + SArray(const std::initializer_list &list) { + CopyFrom(list.begin(), list.end()); + } + + /** @brief Copy from a initializer_list */ + template + void operator=(const std::initializer_list &list) { + CopyFrom(list.begin(), list.end()); + } + + /** + * @brief Reset the current data pointer with a deleter + */ + template + void reset(V *data, size_t size, Deleter del) { + size_ = size; + capacity_ = size; + ptr_.reset(data, del); + } + + /** + * @brief Resizes the array to size elements + * + * If size <= capacity_, then only change the size. otherwise, append size - + * current_size entries, and then set new value to val + */ + void resize(size_t size, V val = 0) { + size_t cur_n = size_; + if (capacity_ >= size) { + size_ = size; + } else { + V *new_data = new V[size + 5]; + memcpy(new_data, data(), size_ * sizeof(V)); + reset(new_data, size, [](V *data) { delete[] data; }); + } + if (size <= cur_n) + return; + V *p = data() + cur_n; + if (val == 0) { + memset(p, 0, (size - cur_n) * sizeof(V)); + } else { + for (size_t i = 0; i < size - cur_n; ++i) { + *p = val; + ++p; + } + } + } + + /** + * @brief Requests that the capacity be at least enough to contain n + * elements. 
+ */ + void reserve(size_t size) { + if (capacity_ >= size) { + return; + } + size_t old_size = size_; + resize(size); + size_ = old_size; + } + + /** @brief release the memory */ + void clear() { + reset(nullptr, 0, [](V *data) {}); + } + + inline bool empty() const { + return size() == 0; + } + inline size_t size() const { + return size_; + } + inline size_t capacity() const { + return capacity_; + } + + inline V *begin() { + return data(); + } + inline const V *begin() const { + return data(); + } + inline V *end() { + return data() + size(); + } + inline const V *end() const { + return data() + size(); + } + + inline V *data() const { + return ptr_.get(); + } + + /** \brief get the shared pointer */ + inline std::shared_ptr &ptr() { + return ptr_; + } + /** \brief get the const shared pointer */ + inline const std::shared_ptr &ptr() const { + return ptr_; + } + + inline V back() const { + CHECK(!empty()); + return data()[size_ - 1]; + } + inline V front() const { + CHECK(!empty()); + return data()[0]; + } + inline V &operator[](size_t i) { + return data()[i]; + } + inline const V &operator[](size_t i) const { + return data()[i]; + } + + inline void push_back(const V &val) { + if (size_ == capacity_) + reserve(size_ * 2 + 5); + data()[size_++] = val; + } + + void pop_back() { + if (size_) + --size_; + } + + void append(const SArray &arr) { + if (arr.empty()) + return; + auto orig_size = size_; + resize(size_ + arr.size()); + memcpy(data() + orig_size, arr.data(), arr.size() * sizeof(V)); + } + + /** + * @brief Slice a segment, zero-copy + * + * @param begin the start index segment + * @param end the end index segment + * @return the segment [begin, end) + */ + SArray segment(size_t begin, size_t end) const { + CHECK_GE(end, begin); + CHECK_LE(end, size()); + SArray ret; + ret.ptr_ = std::shared_ptr(ptr_, data() + begin); + ret.size_ = end - begin; + ret.capacity_ = end - begin; + return ret; + } + +private: + size_t size_ = 0; + size_t capacity_ = 0; + std::shared_ptr ptr_; +}; + +/** + * \brief print a debug string + */ +template +std::ostream &operator<<(std::ostream &os, const SArray &obj) { + os << DebugStr(obj.data(), obj.size()); + return os; +} diff --git a/ps-lite/include/common/shared_mutex.h b/ps-lite/include/common/shared_mutex.h new file mode 100644 index 0000000..d7ea395 --- /dev/null +++ b/ps-lite/include/common/shared_mutex.h @@ -0,0 +1,184 @@ +/* + * shared_mutex (C) 2017 E. Oriani, ema fastwebnet it + * + * This file is part of shared_mutex. + * + * shared_mutex is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * shared_mutex is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with nettop. If not, see . + */ + +#pragma once + +#include +#include +#include + +// this should be defined in the Makefile +// if not defined, use what is most common +// for x86_64 CPUs in 2017... 
+#ifndef LEVEL1_DCACHE_LINESIZE +#define LEVEL1_DCACHE_LINESIZE 64 +#endif + +namespace ps { +template +class shared_mutex { + // purpose of this structure is to hold + // status of each individual bucket-mutex + // object + // Ideally each thread should be mapped to + // one entry only of 'el_' during its + // lifetime + struct entry_lock { + const static uint64_t W_MASK = 0x8000000000000000, R_MASK = ~W_MASK; + + // purpose ot this variable is to hold + // in the first bit (W_MASK) if we're locking + // in exclusive mode, otherwise use the + // reamining 63 bits to count how many R/O + // locks we share in this very bucket + std::atomic wr_lock; + + entry_lock() : wr_lock(0) { + } + } __attribute__((aligned(LEVEL1_DCACHE_LINESIZE))); + // array holding all the buckets + std::array el_; + // atomic variable used to initialize thread + // ids so that they should evenly spread + // across all the buckets + static std::atomic idx_hint_; + // lock-free function to return a 'unique' id + static uint64_t get_hint_idx(void) { + while (true) { + size_t cur_hint = idx_hint_.load(); + if (idx_hint_.compare_exchange_weak(cur_hint, cur_hint + 1)) + return cur_hint; + } + } + // get index for given thread + // could hav used something like + // std::hash()(std::this_thread::get_id()) but honestly + // using a controlled idx_hint_ seems to be better in terms of putting + // threads into buckets evenly note - thread_local is supposed to be + // static... + inline static size_t get_thread_idx(void) { + const thread_local size_t rv = get_hint_idx() % N; + return rv; + } + +public: + shared_mutex() { + } + + void lock_shared(void) { + // try to replace the wr_lock with current value incremented by one + while (true) { + size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load(); + if (entry_lock::W_MASK & cur_rw_lock) { + // if someone has got W access yield and retry... 
+ std::this_thread::yield(); + continue; + } + if (el_[get_thread_idx()].wr_lock.compare_exchange_weak( + cur_rw_lock, cur_rw_lock + 1)) + break; + } + } + + void unlock_shared(void) { + // try to decrement the count + while (true) { + size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load(); +#ifndef _RELEASE + if (entry_lock::W_MASK & cur_rw_lock) + throw std::runtime_error( + "Fatal: unlock_shared but apparently this entry is W_MASK locked!"); +#endif //_RELEASE + if (el_[get_thread_idx()].wr_lock.compare_exchange_weak( + cur_rw_lock, cur_rw_lock - 1)) + break; + } + } + + void lock(void) { + for (size_t i = 0; i < N; ++i) { + // acquire all locks from all buckets + while (true) { + size_t cur_rw_lock = el_[i].wr_lock.load(); + if (cur_rw_lock != 0) { + std::this_thread::yield(); + continue; + } + // if cur_rw_lock is 0 then proceed + if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock, + entry_lock::W_MASK)) + break; + } + } + } + + void unlock(void) { + for (size_t i = 0; i < N; ++i) { + // release all locks + while (true) { + size_t cur_rw_lock = el_[i].wr_lock.load(); +#ifndef _RELEASE + if (cur_rw_lock != entry_lock::W_MASK) + throw std::runtime_error( + "Fatal: unlock but apparently this entry is shared locked or uninitialized!"); +#endif //_RELEASE + // then proceed resetting to 0 + if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock, 0)) + break; + } + } + } + + ~shared_mutex() { + } +}; + +template +std::atomic shared_mutex::idx_hint_{0}; + +// utility class for exclusive RAII lock +template +class x_lock { + shared_mutex &sm_; + +public: + x_lock(shared_mutex &sm) : sm_(sm) { + sm_.lock(); + } + + ~x_lock() { + sm_.unlock(); + } +}; + +// utility class for share RAII lock +template +class s_lock { + shared_mutex &sm_; + +public: + s_lock(shared_mutex &sm) : sm_(sm) { + sm_.lock_shared(); + } + + ~s_lock() { + sm_.unlock_shared(); + } +}; +} // namespace ps diff --git a/ps-lite/include/common/thread_pool.h b/ps-lite/include/common/thread_pool.h new file mode 100644 index 0000000..1899b0a --- /dev/null +++ b/ps-lite/include/common/thread_pool.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { +public: + ThreadPool(size_t thread_num); + ~ThreadPool(); + static ThreadPool *Get(); + + template + auto Enqueue(F &&f, Args &&... 
args) + -> std::future::type> { + using return_type = typename std::result_of::type; + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + std::future res = task->get_future(); + { + std::unique_lock lock(mutex_); + if (terminate_) + throw std::runtime_error("enqueue on stopped ThreadPool"); + tasks_.emplace([task]() { (*task)(); }); + } + cond_.notify_one(); + return res; + } + + void Wait(int task_num); + + size_t ThreadNum() { + return thread_num_; + } + +private: + bool terminate_; + size_t thread_num_; + std::atomic_int complete_task_num_; + std::vector threads_; + std::queue> tasks_; + std::mutex mutex_; + std::condition_variable cond_; +}; diff --git a/ps-lite/include/common/thread_safe_hash_map.h b/ps-lite/include/common/thread_safe_hash_map.h new file mode 100644 index 0000000..ce1f1e7 --- /dev/null +++ b/ps-lite/include/common/thread_safe_hash_map.h @@ -0,0 +1,325 @@ +#pragma once +#include +#include +#include +#include "shared_mutex.h" + +namespace ps { +/* + thread_safe unordered_map + use read-write lock to guaruntee concurrency +*/ +template , + typename _Pred = std::equal_to<_Key>, + typename _Alloc = std::allocator>> +class threadsafe_unordered_map { +private: + std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc> map; + mutable shared_mutex<4> mtx; + +public: + using map_type = std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>; + using key_type = typename map_type::key_type; + using mapped_type = typename map_type::mapped_type; + using value_type = typename map_type::value_type; + using hasher = typename map_type::hasher; + using key_equal = typename map_type::key_equal; + using allocator_type = typename map_type::allocator_type; + using reference = typename map_type::reference; + using const_reference = typename map_type::const_reference; + using pointer = typename map_type::pointer; + using const_pointer = typename map_type::const_pointer; + using iterator = typename map_type::iterator; + using const_iterator = typename map_type::const_iterator; + using local_iterator = typename map_type::local_iterator; + using const_local_iterator = typename map_type::const_local_iterator; + using size_type = typename map_type::size_type; + using difference_type = typename map_type::difference_type; + + threadsafe_unordered_map() = default; + threadsafe_unordered_map(const threadsafe_unordered_map &) = delete; + threadsafe_unordered_map(threadsafe_unordered_map &&) = default; + threadsafe_unordered_map & + operator=(const threadsafe_unordered_map &) = delete; + threadsafe_unordered_map &operator=(threadsafe_unordered_map &&) = delete; + explicit threadsafe_unordered_map( + size_type __n, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__n, __hf, __eql, __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n = 0, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__first, __last, __n, __hf, __eql, __a) { + } + threadsafe_unordered_map(const map_type &v) : map(v) { + } + threadsafe_unordered_map(map_type &&rv) : map(std::move(rv)) { + } + explicit threadsafe_unordered_map(const allocator_type &__a) : map(__a) { + } + threadsafe_unordered_map(const map_type &__umap, + const allocator_type &__a) : + map(__umap, __a) { + } + threadsafe_unordered_map(map_type &&__umap, const allocator_type &__a) : + map(std::move(__umap), __a) { + } + 
threadsafe_unordered_map(std::initializer_list __l, + size_type __n = 0, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__l, __n, __hf, __eql, __a) { + } + threadsafe_unordered_map(size_type __n, const allocator_type &__a) : + threadsafe_unordered_map(__n, hasher(), key_equal(), __a) { + } + threadsafe_unordered_map(size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__n, __hf, key_equal(), __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n, const allocator_type &__a) : + map(__first, __last, __n, __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__first, __last, __n, __hf, key_equal(), __a) { + } + threadsafe_unordered_map(std::initializer_list __l, + size_type __n, const allocator_type &__a) : + threadsafe_unordered_map(__l, __n, hasher(), key_equal(), __a) { + } + threadsafe_unordered_map(std::initializer_list __l, + size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__l, __n, __hf, key_equal(), __a) { + } + bool empty() const noexcept { + s_lock<4> read_lock(mtx); + return map.empty(); + } + size_type size() const noexcept { + s_lock<4> read_lock(mtx); + return map.size(); + } + size_type max_size() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_size(); + } + iterator begin() noexcept { + x_lock<4> write_lock(mtx); + return map.begin(); + } + const_iterator begin() const noexcept { + s_lock<4> read_lock(mtx); + return map.begin(); + } + const_iterator cbegin() const noexcept { + s_lock<4> read_lock(mtx); + return map.cbegin(); + } + iterator end() noexcept { + x_lock<4> write_lock(mtx); + return map.end(); + } + const_iterator end() const noexcept { + s_lock<4> read_lock(mtx); + return map.end(); + } + const_iterator cend() const noexcept { + s_lock<4> read_lock(mtx); + return map.cend(); + } + template + std::pair emplace(_Args &&... __args) { + x_lock<4> write_lock(mtx); + return map.emplace(std::forward<_Args>(__args)...); + } + template + iterator emplace_hint(const_iterator __pos, _Args &&... 
__args) { + x_lock<4> write_lock(mtx); + return map.emplace_hint(__pos, std::forward<_Args>(__args)...); + } + std::pair insert(const value_type &__x) { + x_lock<4> write_lock(mtx); + return map.insert(__x); + } + template ::value>::type> + std::pair insert(_Pair &&__x) { + x_lock<4> write_lock(mtx); + return map.insert(std::forward<_Pair>(__x)); + } + iterator insert(const_iterator __hint, const value_type &__x) { + x_lock<4> write_lock(mtx); + return map.insert(__hint, __x); + } + template ::value>::type> + iterator insert(const_iterator __hint, _Pair &&__x) { + x_lock<4> write_lock(mtx); + return map.insert(__hint, std::forward<_Pair>(__x)); + } + template + void insert(_InputIterator __first, _InputIterator __last) { + x_lock<4> write_lock(mtx); + map.insert(__first, __last); + } + void insert(std::initializer_list __l) { + x_lock<4> write_lock(mtx); + map.insert(__l); + } + iterator erase(const_iterator __position) { + x_lock<4> write_lock(mtx); + return map.erase(__position); + } + iterator erase(iterator __position) { + x_lock<4> write_lock(mtx); + return map.erase(__position); + } + size_type erase(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.erase(__x); + } + iterator erase(const_iterator __first, const_iterator __last) { + x_lock<4> write_lock(mtx); + return map.erase(__first, __last); + } + void clear() noexcept { + x_lock<4> write_lock(mtx); + map.clear(); + } + void swap(map_type &__x) noexcept(noexcept(map.swap(__x._M_h))) { + x_lock<4> write_lock(mtx); + map.swap(__x._M_h); + } + hasher hash_function() const { + s_lock<4> read_lock(mtx); + return map.hash_function(); + } + key_equal key_eq() const { + s_lock<4> read_lock(mtx); + return map.key_eq(); + } + iterator find(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.find(__x); + } + const_iterator find(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.find(__x); + } + size_type count(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.count(__x); + } + std::pair equal_range(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.equal_range(__x); + } + std::pair + equal_range(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.equal_range(__x); + } + mapped_type &operator[](const key_type &__k) { + x_lock<4> write_lock(mtx); + return map[__k]; + } + mapped_type &operator[](key_type &&__k) { + x_lock<4> write_lock(mtx); + return map[std::move(__k)]; + } + mapped_type &at(const key_type &__k) { + x_lock<4> write_lock(mtx); + return map.at(__k); + } + const mapped_type &at(const key_type &__k) const { + s_lock<4> read_lock(mtx); + return map.at(__k); + } + size_type bucket_count() const noexcept { + s_lock<4> read_lock(mtx); + return map.bucket_count(); + } + + size_type max_bucket_count() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_bucket_count(); + } + size_type bucket_size(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.bucket_size(__n); + } + size_type bucket(const key_type &__key) const { + s_lock<4> read_lock(mtx); + return map.bucket(__key); + } + local_iterator begin(size_type __n) { + x_lock<4> write_lock(mtx); + return map.begin(__n); + } + const_local_iterator begin(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.begin(__n); + } + const_local_iterator cbegin(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.cbegin(__n); + } + local_iterator end(size_type __n) { + x_lock<4> write_lock(mtx); + return map.end(__n); + } + const_local_iterator end(size_type 
__n) const { + s_lock<4> read_lock(mtx); + return map.end(__n); + } + const_local_iterator cend(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.cend(__n); + } + float load_factor() const noexcept { + s_lock<4> read_lock(mtx); + return map.load_factor(); + } + float max_load_factor() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_load_factor(); + } + void max_load_factor(float __z) { + x_lock<4> write_lock(mtx); + map.max_load_factor(__z); + } + void rehash(size_type __n) { + x_lock<4> write_lock(mtx); + map.rehash(__n); + } + void reserve(size_type __n) { + x_lock<4> write_lock(mtx); + map.reserve(__n); + } + // ----------------------------- Added function + // ---------------------------------- + template + const_iterator emplaceIfAbsent(const key_type &__x, _Args &&... __args) { + x_lock<4> write_lock(mtx); + iterator iter = map.find(__x); + if (iter == map.end()) { + iter = map.emplace(__x, mapped_type(std::forward<_Args>(__args)...)) + .first; + } + return iter; + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/base.h b/ps-lite/include/ps/base.h new file mode 100644 index 0000000..0984d05 --- /dev/null +++ b/ps-lite/include/ps/base.h @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_BASE_H_ +#define PS_BASE_H_ +#include +#include "ps/internal/utils.h" +namespace ps { + +#if USE_KEY32 +/*! \brief Use unsigned 32-bit int as the key type */ +using Key = uint32_t; +#else +/*! \brief Use unsigned 64-bit int as the key type */ +using Key = uint64_t; +#endif +/*! \brief The maximal allowed key value */ +static const Key kMaxKey = std::numeric_limits::max(); +/** \brief node ID for the scheduler */ +static const int kScheduler = 1; +/** + * \brief the server node group ID + * + * group id can be combined: + * - kServerGroup + kScheduler means all server nodes and the scheuduler + * - kServerGroup + kWorkerGroup means all server and worker nodes + */ +static const int kServerGroup = 2; +/** \brief the worker node group ID */ +static const int kWorkerGroup = 4; + +} // namespace ps +#endif // PS_BASE_H_ diff --git a/ps-lite/include/ps/internal/assign_op.h b/ps-lite/include/ps/internal/assign_op.h new file mode 100644 index 0000000..7941960 --- /dev/null +++ b/ps-lite/include/ps/internal/assign_op.h @@ -0,0 +1,84 @@ +/** + * Copyright (c) 2015 by Contributors + * \file assign_op.h + * \brief assignment operator + * http://en.cppreference.com/w/cpp/language/operator_assignment + */ +#ifndef PS_INTERNAL_ASSIGN_OP_H_ +#define PS_INTERNAL_ASSIGN_OP_H_ +#include "ps/internal/utils.h" +namespace ps { + +enum AssignOp { + ASSIGN, // a = b + PLUS, // a += b + MINUS, // a -= b + TIMES, // a *= b + DIVIDE, // a -= b + AND, // a &= b + OR, // a |= b + XOR // a ^= b +}; + +/** + * \brief return an assignment function: right op= left + */ +template +inline void AssignFunc(const T &lhs, AssignOp op, T *rhs) { + switch (op) { + case ASSIGN: + *right = left; + break; + case PLUS: + *right += left; + break; + case MINUS: + *right -= left; + break; + case TIMES: + *right *= left; + break; + case DIVIDE: + *right /= left; + break; + default: + LOG(FATAL) << "use AssignOpInt.."; + } +} + +/** + * \brief return an assignment function including bit operations, only + * works for integers + */ +template +inline void AssignFuncInt(const T &lhs, AssignOp op, T *rhs) { + switch (op) { + case ASSIGN: + *right = left; + break; + case PLUS: + *right += left; + break; + case MINUS: + *right -= left; + break; + case TIMES: + *right *= left; + break; + case 
DIVIDE: + *right /= left; + break; + case AND: + *right &= left; + break; + case OR: + *right |= left; + break; + case XOR: + *right ^= left; + break; + } +} + +} // namespace ps +#endif // PS_INTERNAL_ASSIGN_OP_H_ diff --git a/ps-lite/include/ps/internal/customer.h b/ps-lite/include/ps/internal/customer.h new file mode 100644 index 0000000..0a7ee11 --- /dev/null +++ b/ps-lite/include/ps/internal/customer.h @@ -0,0 +1,119 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_CUSTOMER_H_ +#define PS_INTERNAL_CUSTOMER_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/internal/message.h" +#include "ps/internal/threadsafe_pqueue.h" +namespace ps { + +/** + * \brief The object for communication. + * + * As a sender, a customer tracks the responses for each request sent. + * + * It has its own receiving thread which is able to process any message received + * from a remote node with `msg.meta.customer_id` equal to this customer's id + */ +class Customer { +public: + /** + * \brief the handle for a received message + * \param recved the received message + */ + using RecvHandle = std::function; + + /** + * \brief constructor + * \param app_id the globally unique id indicating the application the + * postoffice serving for \param customer_id the locally unique id + * indicating the customer of a postoffice \param recv_handle the functino + * for processing a received message + */ + Customer(int app_id, int customer_id, const RecvHandle &recv_handle); + + /** + * \brief desconstructor + */ + ~Customer(); + + /** + * \brief return the globally unique application id + */ + inline int app_id() { + return app_id_; + } + + /** + * \brief return the locally unique customer id + */ + inline int customer_id() { + return customer_id_; + } + + /** + * \brief get a timestamp for a new request. threadsafe + * \param recver the receive node id of this request + * \return the timestamp of this request + */ + int NewRequest(int recver); + + /** + * \brief wait until the request is finished. threadsafe + * \param timestamp the timestamp of the request + */ + void WaitRequest(int timestamp); + + /** + * \brief return the number of responses received for the request. + * threadsafe \param timestamp the timestamp of the request + */ + // int NumResponse(int timestamp); + + /** + * \brief add a number of responses to timestamp + */ + // void AddResponse(int timestamp, int num = 1); + + /** + * \brief accept a received message from \ref Van. 
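assign_op.h above selects a compound assignment (a = b, a += b, a -= b, ...) from an AssignOp value, which is how a server decides whether an incoming value overwrites or accumulates into a stored one. A minimal standalone sketch of the same dispatch idea follows; DemoAssignOp and DemoAssign are illustrative names rather than the header's API (the header declares AssignFunc(lhs, op, rhs)).

#include <cassert>

// Illustrative re-implementation of the op-dispatch idea from assign_op.h.
enum DemoAssignOp { DEMO_ASSIGN, DEMO_PLUS, DEMO_MINUS, DEMO_TIMES, DEMO_DIVIDE };

template <typename T>
inline void DemoAssign(const T &lhs, DemoAssignOp op, T *rhs) {
    switch (op) {
    case DEMO_ASSIGN: *rhs = lhs; break;   // a = b
    case DEMO_PLUS:   *rhs += lhs; break;  // a += b
    case DEMO_MINUS:  *rhs -= lhs; break;  // a -= b
    case DEMO_TIMES:  *rhs *= lhs; break;  // a *= b
    case DEMO_DIVIDE: *rhs /= lhs; break;  // a /= b
    }
}

int main() {
    float acc = 1.0f;
    DemoAssign(2.5f, DEMO_PLUS, &acc);  // acc == 3.5
    DemoAssign(2.0f, DEMO_TIMES, &acc); // acc == 7.0
    assert(acc == 7.0f);
    return 0;
}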
threadsafe + * \param recved the received the message + */ + inline void Accept(const Message &recved) { + recv_queue_.Push(recved); + } + +private: + /** + * \brief the thread function + */ + void Receiving(); + + int app_id_; + + int customer_id_; + + RecvHandle recv_handle_; + ThreadsafePQueue recv_queue_; + // using multithread to speed data processing + std::vector> recv_threads_; + + std::mutex tracker_mu_; + std::condition_variable tracker_cond_; + std::unordered_map tracker_; + int cur_timestamp; + + DISALLOW_COPY_AND_ASSIGN(Customer); +}; + +} // namespace ps +#endif // PS_INTERNAL_CUSTOMER_H_ diff --git a/ps-lite/include/ps/internal/env.h b/ps-lite/include/ps/internal/env.h new file mode 100644 index 0000000..d3e4aa0 --- /dev/null +++ b/ps-lite/include/ps/internal/env.h @@ -0,0 +1,69 @@ +/** + * Copyright (c) 2016 by Contributors + */ +#ifndef PS_INTERNAL_ENV_H_ +#define PS_INTERNAL_ENV_H_ +#include +#include +#include +#include +namespace ps { + +/** + * \brief Environment configurations + */ +class Environment { +public: + /** + * \brief return the singleton instance + */ + static inline Environment *Get() { + return _GetSharedRef(nullptr).get(); + } + /** + * \brief return a shared ptr of the singleton instance + */ + static inline std::shared_ptr _GetSharedRef() { + return _GetSharedRef(nullptr); + } + /** + * \brief initialize the environment + * \param envs key-value environment variables + * \return the initialized singleton instance + */ + static inline Environment * + Init(const std::unordered_map &envs) { + Environment *env = _GetSharedRef(&envs).get(); + env->kvs = envs; + return env; + } + + /** + * \brief find the env value. + * User-defined env vars first. If not found, check system's environment + * \param k the environment key + * \return the related environment value, nullptr when not found + */ + const char *find(const char *k) { + std::string key(k); + return kvs.find(key) == kvs.end() ? getenv(k) : kvs[key].c_str(); + } + +private: + explicit Environment( + const std::unordered_map *envs) { + if (envs) + kvs = *envs; + } + + static std::shared_ptr + _GetSharedRef(const std::unordered_map *envs) { + static std::shared_ptr inst_ptr(new Environment(envs)); + return inst_ptr; + } + + std::unordered_map kvs; +}; + +} // namespace ps +#endif // PS_INTERNAL_ENV_H_ diff --git a/ps-lite/include/ps/internal/message.h b/ps-lite/include/ps/internal/message.h new file mode 100644 index 0000000..ceb8133 --- /dev/null +++ b/ps-lite/include/ps/internal/message.h @@ -0,0 +1,170 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_MESSAGE_H_ +#define PS_INTERNAL_MESSAGE_H_ +#include "common/sarray.h" +#include "ps/psf/PSFunc.h" +#include +#include +#include +#include + +namespace ps { + +/** + * \brief information about a node + */ +struct Node { + /** \brief the empty value */ + static const int kEmpty; + /** \brief default constructor */ + Node() : id(kEmpty), port(kEmpty), is_recovery(false) { + } + /** \brief node roles */ + enum Role { SERVER, WORKER, SCHEDULER }; + /** \brief get debug string */ + std::string DebugString() const { + std::stringstream ss; + ss << "role=" + << (role == SERVER ? "server" : + (role == WORKER ? "worker" : "scheduler")) + << (id != kEmpty ? ", id=" + std::to_string(id) : "") + << ", ip=" << hostname << ", port=" << port + << ", is_recovery=" << is_recovery; + + return ss.str(); + } + /** \brief get short debug string */ + std::string ShortDebugString() const { + std::string str = role == SERVER ? "S" : (role == WORKER ? 
"W" : "H"); + if (id != kEmpty) + str += "[" + std::to_string(id) + "]"; + return str; + } + /** \brief the role of this node */ + Role role; + /** \brief node id */ + int id; + /** \brief customer id */ + int customer_id; + /** \brief hostname or ip */ + std::string hostname; + /** \brief the port this node is binding */ + int port; + /** \brief whether this node is created by failover */ + bool is_recovery; +}; +/** + * \brief meta info of a system control message + */ +struct Control { + /** \brief empty constructor */ + Control() : cmd(EMPTY) { + } + /** \brief return true is empty */ + inline bool empty() const { + return cmd == EMPTY; + } + /** \brief get debug string */ + std::string DebugString() const { + if (empty()) + return ""; + std::vector cmds = {"EMPTY", "TERMINATE", "ADD_NODE", + "BARRIER", "ACK", "HEARTBEAT"}; + std::stringstream ss; + ss << "cmd=" << cmds[cmd]; + if (node.size()) { + ss << ", node={"; + for (const Node &n : node) + ss << " " << n.DebugString(); + ss << " }"; + } + if (cmd == BARRIER) + ss << ", barrier_group=" << barrier_group; + if (cmd == ACK) + ss << ", msg_sig=" << msg_sig; + return ss.str(); + } + /** \brief all commands */ + enum Command { EMPTY, TERMINATE, ADD_NODE, BARRIER, ACK, HEARTBEAT }; + /** \brief the command */ + Command cmd; + /** \brief node infos */ + std::vector node; + /** \brief the node group for a barrier, such as kWorkerGroup */ + int barrier_group; + /** message signature */ + uint64_t msg_sig; +}; +/** + * \brief meta info of a message + */ +struct Meta { + /** \brief the empty value */ + static const int kEmpty; + /** \brief default constructor */ + Meta() : + app_id(kEmpty), customer_id(kEmpty), timestamp(kEmpty), sender(kEmpty), + recver(kEmpty), request(false), priority(kEmpty), + psftype(PsfType::DensePull) { + } + std::string DebugString() const { + std::stringstream ss; + if (sender == Node::kEmpty) { + ss << "?"; + } else { + ss << sender; + } + ss << " => " << recver; + if (timestamp != kEmpty) + ss << ", timestamp=" << timestamp; + if (!control.empty()) { + ss << ", control={ " << control.DebugString() << " }"; + } else { + ss << ", app_id=" << app_id << ", customer_id=" << customer_id + << ", priority=" << priority << ", psfType=" << psftype; + } + return ss.str(); + } + /** \brief the unique id of the application of messsage is for*/ + int app_id; + /** \brief customer id*/ + int customer_id; + /** \brief the timestamp of this message */ + int timestamp; + /** \brief the node id of the sender of this message */ + int sender; + /** \brief the node id of the receiver of this message */ + int recver; + /** \brief whether or not this is a request message*/ + bool request; + /** \brief system control message */ + Control control; + /** \brief message priority */ + int priority; + /** \brief server-side computation op for keys */ + PsfType psftype; +}; +/** + * \brief messages that communicated amaong nodes. 
+ */ +struct Message { + /** \brief the meta info of this message */ + Meta meta; + /** \brief the large chunk of data of this message */ + std::vector> data; + + std::string DebugString() const { + std::stringstream ss; + ss << meta.DebugString(); + if (data.size()) { + ss << " Body:"; + for (const auto &d : data) + ss << " data_size=" << d.size(); + } + return ss.str(); + } +}; +} // namespace ps +#endif // PS_INTERNAL_MESSAGE_H_ diff --git a/ps-lite/include/ps/internal/parallel_kv_match.h b/ps-lite/include/ps/internal/parallel_kv_match.h new file mode 100644 index 0000000..6a9bf18 --- /dev/null +++ b/ps-lite/include/ps/internal/parallel_kv_match.h @@ -0,0 +1,125 @@ +/** + * Copyright (c) 2015 by Contributors + * \file parallel_kv_match.h + * \brief paralle key-value pairs matching + */ +#ifndef PS_INTERNAL_PARALLEL_KV_MATCH_H_ +#define PS_INTERNAL_PARALLEL_KV_MATCH_H_ +#include +#include +#include "ps/sarray.h" +#include "ps/internal/assign_op.h" + +namespace ps { +namespace { +/** + * \brief thread function, internal use + * + * \param src_key start of source key + * \param src_key_end end of source key + * \param src_val start of source val + * \param dst_key start of destination key + * \param dst_key_end end of denstination key + * \param dst_val start of destination val + * \param k length of a single value + * \param op assignment operator + * \param grainsize thread grainsize size + * \param n number of matched kv pairs + */ +template +void ParallelOrderedMatch(const K *src_key, const K *src_key_end, + const V *src_val, const K *dst_key, + const K *dst_key_end, V *dst_val, int k, AsOp op, + size_t grainsize, size_t *n) { + size_t src_len = std::distance(src_key, src_key_end); + size_t dst_len = std::distance(dst_key, dst_key_end); + if (dst_len == 0 || src_len == 0) + return; + + // drop the unmatched tail of src + src_key = std::lower_bound(src_key, src_key_end, *dst_key); + src_val += (src_key - (src_key_end - src_len)) * k; + + if (dst_len <= grainsize) { + while (dst_key != dst_key_end && src_key != src_key_end) { + if (*src_key < *dst_key) { + ++src_key; + src_val += k; + } else { + if (!(*dst_key < *src_key)) { + for (int i = 0; i < k; ++i) { + AssignOp(dst_val[i], src_val[i], op); + } + ++src_key; + src_val += k; + *n += k; + } + ++dst_key; + dst_val += k; + } + } + } else { + std::thread thr(ParallelOrderedMatch, src_key, src_key_end, + src_val, dst_key, dst_key + dst_len / 2, dst_val, k, op, + grainsize, n); + size_t m = 0; + ParallelOrderedMatch( + src_key, src_key_end, src_val, dst_key + dst_len / 2, dst_key_end, + dst_val + (dst_len / 2) * k, k, op, grainsize, &m); + thr.join(); + *n += m; + } +} +} // namespace + +/** + * \brief Merge \a src_val into \a dst_val by matching keys. Keys must be unique + * and sorted. + * + * \code + * if (dst_key[i] == src_key[j]) { + * dst_val[i] op= src_val[j] + * } + * \endcode + * + * When finished, \a dst_val will have length `k * dst_key.size()` and filled + * with matched value. Umatched value will be untouched if exists or filled with + * 0. + * + * \tparam K type of key + * \tparam V type of value + * \tparam C type of the container such as \ref SArray or \ref std::vector + * \param src_key the source keys + * \param src_val the source values + * \param dst_key the destination keys + * \param dst_val the destination values. 
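The \code block above is the whole contract of ParallelOrderedMatch: both key arrays are sorted and duplicate-free, and a destination slot is filled only when its key also appears in the source. A single-threaded sketch of that merge, using std::vector in place of SArray and fixing op to ASSIGN (OrderedMatchAssign is an illustrative name, not the library's):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified ordered match: for every dst key that also appears in src,
// copy the k values attached to that key; unmatched slots stay zero.
// Both key arrays must be sorted and contain no duplicates.
size_t OrderedMatchAssign(const std::vector<uint64_t> &src_key,
                          const std::vector<float> &src_val,
                          const std::vector<uint64_t> &dst_key,
                          std::vector<float> *dst_val, int k) {
    dst_val->assign(dst_key.size() * k, 0.0f);
    size_t i = 0, j = 0, matched = 0;
    while (i < src_key.size() && j < dst_key.size()) {
        if (src_key[i] < dst_key[j]) {
            ++i;
        } else if (dst_key[j] < src_key[i]) {
            ++j;
        } else { // equal keys: copy the k-long value slice
            for (int c = 0; c < k; ++c)
                (*dst_val)[j * k + c] = src_val[i * k + c];
            matched += k;
            ++i;
            ++j;
        }
    }
    return matched; // number of matched value entries, as in the header
}

int main() {
    std::vector<uint64_t> src_key = {1, 3, 5}, dst_key = {3, 4, 5};
    std::vector<float> src_val = {10, 30, 50}, dst_val;
    size_t n = OrderedMatchAssign(src_key, src_val, dst_key, &dst_val, 1);
    assert(n == 2 && dst_val[0] == 30 && dst_val[1] == 0 && dst_val[2] == 50);
    return 0;
}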
+ * \param k the length of a single value (default is 1) + * \param op the assignment operator (default is ASSIGN) + * \param num_threads number of thread (default is 1) + * \return the number of matched kv pairs + */ +template +size_t ParallelOrderedMatch(const SArray &src_key, const SArray &src_val, + const SArray &dst_key, C *dst_val, int k = 1, + AssignOp op = ASSIGN, int num_threads = 1) { + // do check + CHECK_GT(num_threads, 0); + CHECK_EQ(src_key.size() * k, src_val.size()); + CHECK_NOTNULL(dst_val->resize(dst_key.size() * k)); + if (dst_key.empty()) + return 0; + + // shorten the matching range + Range range = FindRange(dst_key, src_key.begin(), src_key.end()); + size_t grainsize = std::max(range.size() * k / num_threads + 5, + static_cast(1024 * 1024)); + size_t n = 0; + ParallelOrderedMatch( + src_key.begin(), src_key.end(), src_val.begin(), + dst_key.begin() + range.begin(), dst_key.begin() + range.end(), + dst_val->begin() + range.begin() * k, k, op, grainsize, &n); + return n; +} + +} // namespace ps +#endif // PS_INTERNAL_PARALLEL_KV_MATCH_H_ diff --git a/ps-lite/include/ps/internal/parallel_sort.h b/ps-lite/include/ps/internal/parallel_sort.h new file mode 100644 index 0000000..174b0be --- /dev/null +++ b/ps-lite/include/ps/internal/parallel_sort.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) 2015 by Contributors + * @file parallel_sort.h + * @brief Parallel sort + */ +#ifndef PS_INTERNAL_PARALLEL_SORT_H_ +#define PS_INTERNAL_PARALLEL_SORT_H_ +#include +#include +#include +#include "ps/sarray.h" +namespace ps { + +namespace { +/** + * \brief the thread function + * + * \param data start pointer of data + * \param len length of data + * \param grainsize max data length of one thread + * \param cmp comparison function + */ +template +void ParallelSort(T *data, size_t len, size_t grainsize, const Fn &cmp) { + if (len <= grainsize) { + std::sort(data, data + len, cmp); + } else { + std::thread thr(ParallelSort, data, len / 2, grainsize, cmp); + ParallelSort(data + len / 2, len - len / 2, grainsize, cmp); + thr.join(); + + std::inplace_merge(data, data + len / 2, data + len, cmp); + } +} +} // namespace + +/** + * \brief Parallel Sort + * + * \param arr the array for sorting + * \param num_threads number of thread + * \param cmp the comparision function such as + * [](const T& a, const T& b) {* return a < b; } + * or an even simplier version: + * std::less() + */ +template +void ParallelSort(SArray *arr, int num_threads = 2, + const Fn &cmp = std::less()) { + CHECK_GT(num_threads, 0); + CHECK(cmp); + size_t grainsize = + std::max(arr->size() / num_threads + 5, (size_t)1024 * 16); + ParallelSort(arr->data(), arr->size(), grainsize, cmp); +} + +} // namespace ps +#endif // PS_INTERNAL_PARALLEL_SORT_H_ diff --git a/ps-lite/include/ps/internal/postoffice.h b/ps-lite/include/ps/internal/postoffice.h new file mode 100644 index 0000000..674b01e --- /dev/null +++ b/ps-lite/include/ps/internal/postoffice.h @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_POSTOFFICE_H_ +#define PS_INTERNAL_POSTOFFICE_H_ +#include +#include +#include +#include +#include +#include "ps/range.h" +#include "ps/internal/env.h" +#include "ps/internal/customer.h" +#include "ps/internal/van.h" +namespace ps { +/** + * \brief the center of the system + */ +class Postoffice { +public: + /** + * \brief return the singleton object + */ + static Postoffice *Get() { + static Postoffice e; + return &e; + } + /** \brief get the van */ + Van *van() { + return van_; + } + /** + * \brief 
start the system + * + * This function will block until every nodes are started. + * \param argv0 the program name, used for logging. + * \param do_barrier whether to block until every nodes are started. + */ + void Start(int customer_id, const char *argv0, const bool do_barrier); + /** + * \brief terminate the system + * + * All nodes should call this function before existing. + * \param do_barrier whether to do block until every node is finalized, + * default true. + */ + void Finalize(const int customer_id, const bool do_barrier = true); + /** + * \brief add an customer to the system. threadsafe + */ + void AddCustomer(Customer *customer); + /** + * \brief remove a customer by given it's id. threasafe + */ + void RemoveCustomer(Customer *customer); + /** + * \brief get the customer by id, threadsafe + * \param app_id the application id + * \param customer_id the customer id + * \param timeout timeout in sec + * \return return nullptr if doesn't exist and timeout + */ + Customer *GetCustomer(int app_id, int customer_id, int timeout = 0) const; + /** + * \brief get the id of a node (group), threadsafe + * + * if it is a node group, return the list of node ids in this + * group. otherwise, return {node_id} + */ + const std::vector &GetNodeIDs(int node_id) const { + const auto it = node_ids_.find(node_id); + CHECK(it != node_ids_.cend()) << "node " << node_id << " doesn't exist"; + return it->second; + } + /** + * \brief return the key ranges of all server nodes + */ + const std::vector &GetServerKeyRanges(); + /** + * \brief the template of a callback + */ + using Callback = std::function; + /** + * \brief Register a callback to the system which is called after Finalize() + * + * The following codes are equal + * \code {cpp} + * RegisterExitCallback(cb); + * Finalize(); + * \endcode + * + * \code {cpp} + * Finalize(); + * cb(); + * \endcode + * \param cb the callback function + */ + void RegisterExitCallback(const Callback &cb) { + exit_callback_ = cb; + } + /** + * \brief convert from a worker rank into a node id + * \param rank the worker rank + */ + static inline int WorkerRankToID(int rank) { + return rank * 2 + 9; + } + /** + * \brief convert from a server rank into a node id + * \param rank the server rank + */ + static inline int ServerRankToID(int rank) { + return rank * 2 + 8; + } + /** + * \brief convert from a node id into a server or worker rank + * \param id the node id + */ + static inline int IDtoRank(int id) { +#ifdef _MSC_VER +#undef max +#endif + return std::max((id - 8) / 2, 0); + } + /** \brief Returns the number of worker nodes */ + int num_workers() const { + return num_workers_; + } + /** \brief Returns the number of server nodes */ + int num_servers() const { + return num_servers_; + } + /** \brief Returns the rank of this node in its group + * + * Each worker will have a unique rank within [0, NumWorkers()). So are + * servers. This function is available only after \ref Start has been + * called. + */ + int my_rank() const { + return IDtoRank(van_->my_node().id); + } + /** \brief Returns true if this node is a worker node */ + int is_worker() const { + return is_worker_; + } + /** \brief Returns true if this node is a server node. */ + int is_server() const { + return is_server_; + } + /** \brief Returns true if this node is a scheduler node. */ + int is_scheduler() const { + return is_scheduler_; + } + /** \brief Returns the verbose level. 
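The rank/ID helpers above encode the node type in the parity of the ID: servers get even IDs starting at 8, workers get odd IDs starting at 9, and IDtoRank inverts both mappings. A small self-contained check of that arithmetic, reimplemented locally so it runs without the rest of the library:

#include <algorithm>
#include <cassert>

// Same arithmetic as Postoffice::WorkerRankToID / ServerRankToID / IDtoRank.
inline int WorkerRankToID(int rank) { return rank * 2 + 9; }
inline int ServerRankToID(int rank) { return rank * 2 + 8; }
inline int IDtoRank(int id) { return std::max((id - 8) / 2, 0); }

int main() {
    for (int rank = 0; rank < 4; ++rank) {
        assert(IDtoRank(WorkerRankToID(rank)) == rank); // worker IDs: 9, 11, 13, ...
        assert(IDtoRank(ServerRankToID(rank)) == rank); // server IDs: 8, 10, 12, ...
        assert(ServerRankToID(rank) % 2 == 0);          // servers are even
        assert(WorkerRankToID(rank) % 2 == 1);          // workers are odd
    }
    return 0;
}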
*/ + int verbose() const { + return verbose_; + } + /** \brief Return whether this node is a recovery node */ + bool is_recovery() const { + return van_->my_node().is_recovery; + } + /** + * \brief barrier + * \param node_id the barrier group id + */ + void Barrier(int customer_id, int node_group); + /** + * \brief process a control message, called by van + * \param the received message + */ + void Manage(const Message &recv); + /** + * \brief update the heartbeat record map + * \param node_id the \ref Node id + * \param t the last received heartbeat time + */ + void UpdateHeartbeat(int node_id, time_t t) { + std::lock_guard lk(heartbeat_mu_); + heartbeats_[node_id] = t; + } + /** + * \brief get node ids that haven't reported heartbeats for over t seconds + * \param t timeout in sec + */ + std::vector GetDeadNodes(int t = 60); + +private: + Postoffice(); + ~Postoffice() { + delete van_; + } + + void InitEnvironment(); + Van *van_; + mutable std::mutex mu_; + // app_id -> (customer_id -> customer pointer) + std::unordered_map> customers_; + std::unordered_map> node_ids_; + std::mutex server_key_ranges_mu_; + std::vector server_key_ranges_; + bool is_worker_, is_server_, is_scheduler_; + int num_servers_, num_workers_; + std::unordered_map> barrier_done_; + int verbose_; + std::mutex barrier_mu_; + std::condition_variable barrier_cond_; + std::mutex heartbeat_mu_; + std::mutex start_mu_; + int init_stage_ = 0; + std::unordered_map heartbeats_; + Callback exit_callback_; + /** \brief Holding a shared_ptr to prevent it from being destructed too + * early */ + std::shared_ptr env_ref_; + time_t start_time_; + DISALLOW_COPY_AND_ASSIGN(Postoffice); +}; + +/** \brief verbose log */ +#define PS_VLOG(x) LOG_IF(INFO, x <= Postoffice::Get()->verbose()) +} // namespace ps +#endif // PS_INTERNAL_POSTOFFICE_H_ diff --git a/ps-lite/include/ps/internal/threadsafe_pqueue.h b/ps-lite/include/ps/internal/threadsafe_pqueue.h new file mode 100644 index 0000000..95a927f --- /dev/null +++ b/ps-lite/include/ps/internal/threadsafe_pqueue.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_THREADSAFE_PQUEUE_H_ +#define PS_INTERNAL_THREADSAFE_PQUEUE_H_ +#include +#include +#include +#include +#include +#include +#include "ps/base.h" +namespace ps { + +/** + * \brief thread-safe queue allowing push and waited pop + */ +class ThreadsafePQueue { +public: + ThreadsafePQueue() { + } + ~ThreadsafePQueue() { + } + + /** + * \brief push an value into the end. threadsafe. 
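UpdateHeartbeat and GetDeadNodes above form the failure detector: every node's last heartbeat time is recorded, and nodes whose record is older than t seconds are reported dead. GetDeadNodes itself is defined in the .cc file, so the following is only a plausible sketch of that check, over a plain unordered_map instead of the Postoffice members:

#include <cassert>
#include <ctime>
#include <unordered_map>
#include <vector>

// Return the ids whose last heartbeat is more than `timeout` seconds old.
std::vector<int> DeadNodes(const std::unordered_map<int, time_t> &heartbeats,
                           time_t now, int timeout) {
    std::vector<int> dead;
    for (const auto &kv : heartbeats)
        if (kv.second + timeout < now)
            dead.push_back(kv.first);
    return dead;
}

int main() {
    time_t now = std::time(nullptr);
    std::unordered_map<int, time_t> hb = {
        {9, now - 5},    // worker 9 reported 5 s ago: alive
        {11, now - 120}, // worker 11 reported 2 min ago: dead for a 60 s timeout
    };
    std::vector<int> dead = DeadNodes(hb, now, 60);
    assert(dead.size() == 1 && dead[0] == 11);
    return 0;
}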
+ * \param new_value the value + */ + void Push(Message new_value) { + mu_.lock(); + queue_.push(std::move(new_value)); + mu_.unlock(); + cond_.notify_all(); + } + + /** + * \brief wait until pop an element from the beginning, threadsafe + * \param value the poped value + */ + void WaitAndPop(Message *value) { + std::unique_lock lk(mu_); + cond_.wait(lk, [this] { return !queue_.empty(); }); + *value = std::move(queue_.top()); + queue_.pop(); + } + +private: + class Compare { + public: + bool operator()(const Message &l, const Message &r) { + // hbsun: note it is the max-heap, + // In other words, the priority is larger, the quickly it is + // processed push is first, pull is later + return l.meta.priority <= r.meta.priority; + } + }; + mutable std::mutex mu_; + std::priority_queue, Compare> queue_; + std::condition_variable cond_; +}; + +} // namespace ps + +#endif // PS_INTERNAL_THREADSAFE_PQUEUE_H_ diff --git a/ps-lite/include/ps/internal/threadsafe_queue.h b/ps-lite/include/ps/internal/threadsafe_queue.h new file mode 100644 index 0000000..a4cd340 --- /dev/null +++ b/ps-lite/include/ps/internal/threadsafe_queue.h @@ -0,0 +1,63 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_THREADSAFE_QUEUE_H_ +#define PS_INTERNAL_THREADSAFE_QUEUE_H_ +#include +#include +#include +#include +#include +#include "ps/base.h" +namespace ps { + +/** + * \brief thread-safe queue allowing push and waited pop + */ +template +class ThreadsafeQueue { +public: + ThreadsafeQueue() { + } + ~ThreadsafeQueue() { + } + + /** + * \brief push an value into the end. threadsafe. + * \param new_value the value + */ + void Push(T new_value) { + mu_.lock(); + queue_.push(std::move(new_value)); + mu_.unlock(); + cond_.notify_all(); + } + + /** + * \brief wait until pop an element from the beginning, threadsafe + * \param value the poped value + */ + void WaitAndPop(T *value) { + std::unique_lock lk(mu_); + cond_.wait(lk, [this] { return !queue_.empty(); }); + *value = std::move(queue_.front()); + queue_.pop(); + } + +private: + mutable std::mutex mu_; + std::queue queue_; + std::condition_variable cond_; +}; + +} // namespace ps + +// bool TryPop(T& value) { +// std::lock_guard lk(mut); +// if(data_queue.empty()) +// return false; +// value=std::move(data_queue.front()); +// data_queue.pop(); +// return true; +// } +#endif // PS_INTERNAL_THREADSAFE_QUEUE_H_ diff --git a/ps-lite/include/ps/internal/utils.h b/ps-lite/include/ps/internal/utils.h new file mode 100644 index 0000000..f12b968 --- /dev/null +++ b/ps-lite/include/ps/internal/utils.h @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_UTILS_H_ +#define PS_INTERNAL_UTILS_H_ +#include "common/logging.h" +#include "ps/internal/env.h" +namespace ps { + +#ifdef _MSC_VER +typedef signed char int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned char uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +/*! + * \brief Get environment variable as int with default. + * \param key the name of environment variable. + * \param default_val the default value of environment vriable. 
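ThreadsafePQueue and ThreadsafeQueue above share one pattern: Push never blocks, WaitAndPop sleeps on a condition variable until an element arrives, and the priority queue's Compare makes the highest-priority message pop first. A minimal standalone queue with the same behaviour, using int in place of Message (DemoPQueue is an illustrative name):

#include <cassert>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

// Condition-variable-backed priority queue: Push never blocks, WaitAndPop
// blocks until an element is available and returns the largest one.
class DemoPQueue {
public:
    void Push(int v) {
        {
            std::lock_guard<std::mutex> lk(mu_);
            q_.push(v);
        }
        cond_.notify_all();
    }
    void WaitAndPop(int *v) {
        std::unique_lock<std::mutex> lk(mu_);
        cond_.wait(lk, [this] { return !q_.empty(); });
        *v = q_.top();
        q_.pop();
    }

private:
    std::mutex mu_;
    std::condition_variable cond_;
    std::priority_queue<int> q_; // max-heap, like the Message queue above
};

int main() {
    DemoPQueue q;
    std::thread producer([&q] { q.Push(1); q.Push(5); q.Push(3); });
    producer.join();
    int v;
    q.WaitAndPop(&v);
    assert(v == 5); // highest priority comes out first
    return 0;
}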
+ * \return The value received + */ +template +inline V GetEnv(const char *key, V default_val) { + const char *val = Environment::Get()->find(key); + if (val == nullptr) { + return default_val; + } else { + return V(val); + } +} + +inline int GetEnv(const char *key, int default_val) { + const char *val = Environment::Get()->find(key); + if (val == nullptr) { + return default_val; + } else { + return atoi(val); + } +} + +#ifndef DISALLOW_COPY_AND_ASSIGN +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName &); \ + void operator=(const TypeName &) +#endif + +#define LL LOG(ERROR) + +} // namespace ps +#endif // PS_INTERNAL_UTILS_H_ diff --git a/ps-lite/include/ps/internal/van.h b/ps-lite/include/ps/internal/van.h new file mode 100644 index 0000000..b9f5ef9 --- /dev/null +++ b/ps-lite/include/ps/internal/van.h @@ -0,0 +1,213 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_VAN_H_ +#define PS_INTERNAL_VAN_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/base.h" +#include "ps/internal/message.h" +namespace ps { +class Resender; +class PBMeta; +/** + * \brief Van sends messages to remote nodes + * + * If environment variable PS_RESEND is set to be 1, then van will resend a + * message if it no ACK messsage is received within PS_RESEND_TIMEOUT + * millisecond + */ +class Van { +public: + /** + * \brief create Van + * \param type zmq, socket, ... + */ + static Van *Create(const std::string &type); + + /** \brief constructer, do nothing. use \ref Start for real start */ + Van() { + } + + /**\brief deconstructer, do nothing. use \ref Stop for real stop */ + virtual ~Van() { + } + + /** + * \brief start van + * + * must call it before calling Send + * + * it initalizes all connections to other nodes. start the receiving + * threads, which keeps receiving messages. if it is a system + * control message, give it to postoffice::manager, otherwise, give it to + * the accoding app. + */ + virtual void Start(int customer_id); + + /** + * \brief send a message, It is thread-safe + * \return the number of bytes sent. -1 if failed + */ + int Send(const Message &msg); + + /** + * \brief return my node + */ + inline const Node &my_node() const { + CHECK(ready_) << "call Start() first"; + return my_node_; + } + + /** + * \brief stop van + * stop receiving threads + */ + virtual void Stop(); + + /** + * \brief get next available timestamp. thread safe + */ + inline int GetTimestamp() { + return timestamp_++; + } + + /** + * \brief whether it is ready for sending. thread safe + */ + inline bool IsReady() { + return ready_; + } + +protected: + /** + * \brief connect to a node + */ + virtual void Connect(const Node &node) = 0; + + /** + * \brief bind to my node + * do multiple retries on binding the port. since it's possible that + * different nodes on the same machine picked the same port + * \return return the port binded, -1 if failed. + */ + virtual int Bind(const Node &node, int max_retry) = 0; + + /** + * \brief block until received a message + * \return the number of bytes received. 
-1 if failed or timeout + */ + virtual int RecvMsg(Message *msg) = 0; + + /** + * \brief send a mesage + * \return the number of bytes sent + */ + virtual int SendMsg(const Message &msg) = 0; + + /** + * \brief pack meta into a string + */ + void PackMeta(const Meta &meta, char **meta_buf, int *buf_size); + + /** + * \brief pack meta into protobuf + */ + void PackMetaPB(const Meta &meta, PBMeta *pb); + + /** + * \brief unpack meta from a string + */ + void UnpackMeta(const char *meta_buf, int buf_size, Meta *meta); + + Node scheduler_; + Node my_node_; + bool is_scheduler_; + std::mutex start_mu_; + +private: + /** thread function for receving */ + void Receiving(); + + /** thread function for heartbeat */ + void Heartbeat(); + + // node's address string (i.e. ip:port) -> node id + // this map is updated when ip:port is received for the first time + std::unordered_map connected_nodes_; + // maps the id of node which is added later to the id of node + // which is with the same ip:port and added first + std::unordered_map shared_node_mapping_; + + /** whether it is ready for sending */ + std::atomic ready_{false}; + std::atomic send_bytes_{0}; + size_t recv_bytes_ = 0; + int num_servers_ = 0; + int num_workers_ = 0; + /** the thread for receiving messages */ + std::unique_ptr receiver_thread_; + /** the thread for sending heartbeat */ + std::unique_ptr heartbeat_thread_; + std::vector barrier_count_; + /** msg resender */ + Resender *resender_ = nullptr; + int drop_rate_ = 0; + std::atomic timestamp_{0}; + int init_stage = 0; + + /** + * \brief processing logic of AddNode message for scheduler + */ + void ProcessAddNodeCommandAtScheduler(Message *msg, Meta *nodes, + Meta *recovery_nodes); + + /** + * \brief processing logic of Terminate message + */ + void ProcessTerminateCommand(); + + /** + * \brief processing logic of AddNode message (run on each node) + */ + void ProcessAddNodeCommand(Message *msg, Meta *nodes, Meta *recovery_nodes); + + /** + * \brief processing logic of Barrier message (run on each node) + */ + void ProcessBarrierCommand(Message *msg); + + /** + * \brief processing logic of AddNode message (run on each node) + */ + void ProcessHearbeat(Message *msg); + + /** + * \brief processing logic of Data message + */ + void ProcessDataMsg(Message *msg); + + /** + * \brief called by ProcessAddNodeCommand, in scheduler it assigns an id to + * the newly added node; in other nodes, it updates the node id with what is + * received from scheduler + */ + void UpdateLocalID(Message *msg, std::unordered_set *deadnodes_set, + Meta *nodes, Meta *recovery_nodes); + + const char *heartbeat_timeout_val = + Environment::Get()->find("PS_HEARTBEAT_TIMEOUT"); + int heartbeat_timeout_ = + heartbeat_timeout_val ? 
atoi(heartbeat_timeout_val) : 0; + + DISALLOW_COPY_AND_ASSIGN(Van); +}; +} // namespace ps +#endif // PS_INTERNAL_VAN_H_ diff --git a/ps-lite/include/ps/kvapp.h b/ps-lite/include/ps/kvapp.h new file mode 100644 index 0000000..eb87b5b --- /dev/null +++ b/ps-lite/include/ps/kvapp.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ps/internal/postoffice.h" +#include "ps/internal/customer.h" +#include "ps/internal/message.h" + +namespace ps { + +// Recursively register receive message handler (from 0 to kNumPSfunction) +template +struct KVAppRegisterHelper { + static void init(app *ptr) { + ptr->message_handlers[ftype] = std::bind( + &app::template onReceive, ptr, std::placeholders::_1); + KVAppRegisterHelper::init(ptr); + } +}; + +template +struct KVAppRegisterHelper { + static void init(app *ptr) { + } +}; + +class KVApp { +public: + explicit KVApp(int app_id) { + obj_.reset(new Customer( + app_id, app_id, + std::bind(&KVApp::Process, this, std::placeholders::_1))); + } + std::unique_ptr obj_; + +private: + void Process(const Message &msg) { + CHECK_LT(msg.meta.psftype, kNumPSfunction) + << "Unknown PS Function Received"; + message_handlers[msg.meta.psftype](msg); + } + + typedef std::function MessageHandle; + MessageHandle message_handlers[kNumPSfunction]; + template + friend struct KVAppRegisterHelper; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/partitioner.h b/ps-lite/include/ps/partitioner.h new file mode 100644 index 0000000..bb90576 --- /dev/null +++ b/ps-lite/include/ps/partitioner.h @@ -0,0 +1,125 @@ +#pragma once + +#include + +namespace ps { + +class Partitioner { +protected: + const std::vector &server_range; + size_t server_num; + +public: + Partitioner() : server_range(Postoffice::Get()->GetServerKeyRanges()) { + server_num = server_range.size(); + } + virtual ~Partitioner() { + } + virtual void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + } + virtual void partitionSparse(size_t length, size_t width, + std::vector &keys, + std::vector &parts) { + } + virtual int queryServer(Key key) { + return 0; + } +}; + +/* Naive partitioner, average partition into servers */ +class AveragePartitioner : public Partitioner { +private: + Key _globalId; + size_t _serverIndex; + size_t partition_num; + +public: + AveragePartitioner(size_t part_num = 0) : Partitioner() { + _globalId = 0; + _serverIndex = 0; + if (part_num == 0 || part_num > server_num) + part_num = server_num; + partition_num = part_num; + } + + void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + size_t per_part_len = length / partition_num; + size_t rem = length % partition_num; + for (size_t i = 0; i < partition_num; i++) { + size_t server_idx = (i + _serverIndex) % server_num; + keys.push_back(_globalId + server_range[server_idx].begin()); + parts.push_back(per_part_len + (i < rem)); + } + _globalId++; + _serverIndex = (_serverIndex + partition_num) % server_num; + } + + void partitionSparse(size_t length, size_t width, std::vector &keys, + std::vector &parts) { + partitionDense(length, keys, parts); + } + + int queryServer(Key key) { + size_t server_id = 0; + while (server_id < server_num + && key >= server_range[server_id].begin()) { + server_id++; + } + return int(server_id - 1); + } +}; + +/* Use blocks to partition, intuition from BytePS */ +class BlockPartitioner : public Partitioner { +private: + Key _globalId; + size_t _serverIndex; + size_t _block; + +public: + BlockPartitioner(size_t block_size = 1000000) : Partitioner() { + _globalId = 0; + 
_serverIndex = 0; + _block = block_size; + } + + void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + partitionImpl(length, _block, keys, parts); + } + + void partitionSparse(size_t length, size_t width, std::vector &keys, + std::vector &parts) { + size_t cur_block = std::max(_block / width, size_t(1)); + partitionImpl(length, cur_block, keys, parts); + } + + void partitionImpl(size_t length, size_t cur_block, std::vector &keys, + std::vector &parts) { + size_t DLArray_len = length; + while (DLArray_len != 0) { + keys.push_back(_globalId + server_range[_serverIndex].begin()); + _serverIndex++; + auto tmp = std::min(cur_block, DLArray_len); + parts.push_back(tmp); + DLArray_len -= tmp; + if (_serverIndex == server_num) { + _globalId++; + _serverIndex = 0; + } + } + } + + int queryServer(Key key) { + size_t server_id = 0; + while (server_id < server_num + && key >= server_range[server_id].begin()) { + server_id++; + } + return int(server_id - 1); + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/ps.h b/ps-lite/include/ps/ps.h new file mode 100644 index 0000000..f40262f --- /dev/null +++ b/ps-lite/include/ps/ps.h @@ -0,0 +1,90 @@ +/*! + * Copyright (c) 2015 by Contributors + * @file ps.h + * \brief The parameter server interface + */ +#ifndef PS_PS_H_ +#define PS_PS_H_ +/** \brief basic setups in ps */ +#include "ps/base.h" +/** \brief communcating with a list of key-value paris. */ +#include "common/thread_pool.h" +#include "ps/internal/postoffice.h" +namespace ps { +/** \brief Returns the number of worker nodes */ +inline int NumWorkers() { + return Postoffice::Get()->num_workers(); +} +/** \brief Returns the number of server nodes */ +inline int NumServers() { + return Postoffice::Get()->num_servers(); +} +/** \brief Returns true if this node is a worker node */ +inline bool IsWorker() { + return Postoffice::Get()->is_worker(); +} +/** \brief Returns true if this node is a server node. */ +inline bool IsServer() { + return Postoffice::Get()->is_server(); +} +/** \brief Returns true if this node is a scheduler node. */ +inline bool IsScheduler() { + return Postoffice::Get()->is_scheduler(); +} +/** \brief Returns the rank of this node in its group + * + * Each worker will have a unique rank within [0, NumWorkers()). So are + * servers. This function is available only after \ref Start has been called. + */ +inline int MyRank() { + return Postoffice::Get()->my_rank(); +} +/** + * \brief start the system + * + * This function will block until every nodes are started. + * \param argv0 the program name, used for logging + */ +inline void Start(int customer_id, const char *argv0 = nullptr) { + Postoffice::Get()->Start(customer_id, argv0, true); +} +/** + * \brief start the system + * + * This function will NOT block. + * \param argv0 the program name, used for logging + */ +inline void StartAsync(int customer_id, const char *argv0 = nullptr) { + Postoffice::Get()->Start(customer_id, argv0, false); +} +/** + * \brief terminate the system + * + * All nodes should call this function before existing. + * \param do_barrier whether to block until every node is finalized, default + * true. 
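BlockPartitioner above slices a parameter of `length` elements into fixed-size blocks and hands them to the servers round-robin, bumping a global id each time it wraps around the server list. The round-robin split can be sketched without the Key-range machinery; SplitIntoBlocks below is an illustrative helper, and unlike the real partitionImpl it omits the per-server key offsets taken from GetServerKeyRanges():

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative block split: returns (server_index, block_length) pairs,
// walking the servers round-robin the way partitionImpl walks server_range.
std::vector<std::pair<size_t, size_t>>
SplitIntoBlocks(size_t length, size_t block, size_t num_servers) {
    std::vector<std::pair<size_t, size_t>> parts;
    size_t server = 0;
    while (length != 0) {
        size_t take = std::min(block, length);
        parts.emplace_back(server, take);
        length -= take;
        server = (server + 1) % num_servers;
    }
    return parts;
}

int main() {
    // 2.5 blocks of data over 2 servers:
    // server 0 gets 1000000, server 1 gets 1000000, server 0 gets 500000.
    auto parts = SplitIntoBlocks(2500000, 1000000, 2);
    for (const auto &p : parts)
        std::printf("server %zu <- %zu elements\n", p.first, p.second);
    return 0;
}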
+ */ +inline void Finalize(int customer_id, const bool do_barrier = true) { + Postoffice::Get()->Finalize(customer_id, do_barrier); +} +/** + * \brief Register a callback to the system which is called after Finalize() + * + * The following codes are equal + * \code {cpp} + * RegisterExitCallback(cb); + * Finalize(); + * \endcode + * + * \code {cpp} + * Finalize(); + * cb(); + * \endcode + * \param cb the callback function + */ +inline void RegisterExitCallback(const std::function &cb) { + Postoffice::Get()->RegisterExitCallback(cb); +} + +} // namespace ps +#endif // PS_PS_H_ diff --git a/ps-lite/include/ps/psf/PSFunc.h b/ps-lite/include/ps/psf/PSFunc.h new file mode 100644 index 0000000..2988de8 --- /dev/null +++ b/ps-lite/include/ps/psf/PSFunc.h @@ -0,0 +1,63 @@ +#pragma once + +#include "common/sarray.h" +#include "ps/base.h" + +#include +#include +using std::tuple; +using std::get; +using std::function; + +namespace ps { + +enum PsfType { + /* Dense ops */ + DensePush, + DensePull, + DDPushPull, + /* Sparse ops */ + SparsePush, + SparsePull, + SDPushPull, + SSPushPull, + /* misc ops */ + ParamInit, + ParamClear, + ParamSave, + ParamLoad, + /* cache sparse table */ + kSyncEmbedding, + kPushEmbedding, + kPushSyncEmbedding, + kNumPSfunction, +}; + +template +struct PSFData; +/* + To define a new PSFunc, we need 3 parts : Request, Response, _callback + * Request and Response are tuple-like object, and must only use + scalar types like int, float or Sarray + * _callback is a function having format void(const Response&, args...) + where args are some target memory space to write back + * See examples in dense.h sparse.h ... +*/ + +/* + getCallBack, use this to bind _callback to the get the real callback which can + be stored example: getCallBack(target); +*/ +template +function::Response &)> +getCallBack(Args &&... 
args) { + return std::bind(PSFData::_callback, std::placeholders::_1, + std::forward(args)...); +} + +} // namespace ps + +#include "dense.h" +#include "sparse.h" +#include "misc.h" +#include "cachetable.h" diff --git a/ps-lite/include/ps/psf/cachetable.h b/ps-lite/include/ps/psf/cachetable.h new file mode 100644 index 0000000..04ed8f5 --- /dev/null +++ b/ps-lite/include/ps/psf/cachetable.h @@ -0,0 +1,50 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +typedef int64_t version_t; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // data + SArray // updates + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // current version + version_t // bound + >; + using Response = tuple, // rows that should be updated + SArray, // server version returned + SArray // embedding value + >; + // Use a closure to pass cached embedding data target + typedef std::function + Closure; +}; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // current version + version_t, // bound + SArray, // push rows + SArray, // push data + SArray // push updates + >; + using Response = PSFData::Response; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/dense.h b/ps-lite/include/ps/psf/dense.h new file mode 100644 index 0000000..ebb5870 --- /dev/null +++ b/ps-lite/include/ps/psf/dense.h @@ -0,0 +1,44 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +template <> +struct PSFData { + using Request = tuple; + using Response = tuple // data + >; + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +template <> +struct PSFData { + using Request = tuple // data + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = PSFData::Request; + using Response = PSFData::Response; + + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/misc.h b/ps-lite/include/ps/psf/misc.h new file mode 100644 index 0000000..6ad944a --- /dev/null +++ b/ps-lite/include/ps/psf/misc.h @@ -0,0 +1,62 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +enum InitType { + Constant, + Uniform, + Normal, + TruncatedNormal, +}; + +template <> +struct PSFData { + using Request = tuple // opt arguments + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // address + bool // different from load + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple // address + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/serializer.h b/ps-lite/include/ps/psf/serializer.h new file mode 100644 index 0000000..dadd127 --- /dev/null +++ 
b/ps-lite/include/ps/psf/serializer.h @@ -0,0 +1,110 @@ +#pragma once + +#include "common/sarray.h" + +#include +#include +using std::tuple; +using std::vector; + +namespace ps { + +// we don't have if-constexpr in c++11, so we use this +template +class ScalarTag {}; + +// decide whether a data is scalar type or SArray +// isScalar::value -> true +template +class isScalar { +public: + constexpr static bool value = + std::is_integral::value || std::is_floating_point::value; + using Tag = ScalarTag; +}; + +// Helper class to serialize Tuples recursively +template +class tupleSerializer { +public: + // encode scalar type, put it in target[0] + template + static void _encode(const dtype &t, vector> &target, + ScalarTag) { + size_t cur_size = target[0].size(); + target[0].resize(cur_size + sizeof(dtype)); + dtype *ptr = reinterpret_cast(target[0].data() + cur_size); + *ptr = t; + } + // encode sarray type, append it to target(no copy) + template + static void _encode(const dtype &t, vector> &target, + ScalarTag) { + SArray bytes(t); + target.push_back(bytes); + } + // encode a tuple from back to front + static void encode(const Tuple &tup, vector> &target) { + auto &t = std::get(tup); + typedef typename std::remove_reference::type dtype; + _encode(t, target, typename isScalar::Tag()); + tupleSerializer::encode(tup, target); + } + //---------------------------------Decode--------------------------------------- + template + static void _decode(dtype &t, const vector> &target, + ScalarTag, size_t &scalar_hint, + size_t &array_hint) { + dtype *ptr = reinterpret_cast(target[0].data() + scalar_hint + - sizeof(dtype)); + t = *ptr; + scalar_hint -= sizeof(dtype); + } + template + static void _decode(dtype &t, const vector> &target, + ScalarTag, size_t &scalar_hint, + size_t &array_hint) { + t = target[array_hint - 1]; + array_hint--; + } + // scalar_hint, array_hint, tell where to take the data from target + static void decode(Tuple &tup, const vector> &target, + size_t scalar_hint, size_t array_hint) { + // When decode, from front to back + auto &t = std::get::value - N>(tup); + typedef typename std::remove_reference::type dtype; + _decode(t, target, typename isScalar::Tag(), scalar_hint, + array_hint); + tupleSerializer::decode(tup, target, scalar_hint, + array_hint); + } +}; + +// Handle template specialization +template +class tupleSerializer { +public: + static void encode(const Tuple &tup, vector> &target) { + } + static void decode(Tuple &tup, const vector> &target, + size_t scalar_hint, size_t array_hint) { + } +}; + +// ------------------------------ Exported APIs +// ------------------------------------------------ +template +void tupleEncode(const Tuple &tup, vector> &dest) { + dest.clear(); + dest.push_back(SArray()); // Reserve for scalar types + dest[0].reserve(sizeof(Tuple)); + tupleSerializer::value>::encode(tup, dest); +} + +template +void tupleDecode(Tuple &tup, const vector> &dest) { + tupleSerializer::value>::decode( + tup, dest, dest[0].size(), dest.size()); +} + +} // namespace ps diff --git a/ps-lite/include/ps/psf/sparse.h b/ps-lite/include/ps/psf/sparse.h new file mode 100644 index 0000000..4934fa0 --- /dev/null +++ b/ps-lite/include/ps/psf/sparse.h @@ -0,0 +1,87 @@ +#pragma once + +#include "PSFunc.h" +#include "dense.h" + +namespace ps { + +template <> +struct PSFData { + using Request = tuple // offset + >; + using Response = tuple // data + >; + static void + _callback(const Response &response, SArray tgt, + std::vector>> mapping, + size_t offset, size_t width) { + auto val 
= get<0>(response); + CHECK_EQ(val.size(), mapping.size() * width) + << val.size() << " " << mapping.size() << " " << width; + for (size_t i = 0; i < mapping.size(); ++i) { + auto begin_iter = val.begin() + i * width; + auto end_iter = begin_iter + width; + for (auto idx : mapping[i].second) { + std::copy(begin_iter, end_iter, tgt.begin() + idx * width); + } + } + } +}; + +template <> +struct PSFData { + using Request = tuple, // offset + SArray // data + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // offset + SArray, // data + size_t // len for densepull + >; + using Response = PSFData::Response; + + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +template <> +struct PSFData { + using Request = tuple, // push offset + SArray, // data + SArray // pull offset + >; + using Response = PSFData::Response; + + static void + _callback(const Response &response, SArray tgt, + std::vector>> mapping, + size_t offset, size_t width) { + auto val = get<0>(response); + if (val.size() > 0) { + CHECK_EQ(val.size(), mapping.size() * width) + << val.size() << " " << mapping.size() << " " << width; + for (size_t i = 0; i < mapping.size(); ++i) { + auto begin_iter = val.begin() + i * width; + auto end_iter = begin_iter + width; + for (auto idx : mapping[i].second) { + std::copy(begin_iter, end_iter, tgt.begin() + idx * width); + } + } + } + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/range.h b/ps-lite/include/ps/range.h new file mode 100644 index 0000000..3905939 --- /dev/null +++ b/ps-lite/include/ps/range.h @@ -0,0 +1,35 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_RANGE_H_ +#define PS_RANGE_H_ +#include "ps/internal/utils.h" +namespace ps { + +/** + * \brief a range [begin, end) + */ +class Range { +public: + Range() : Range(0, 0) { + } + Range(uint64_t begin, uint64_t end) : begin_(begin), end_(end) { + } + + uint64_t begin() const { + return begin_; + } + uint64_t end() const { + return end_; + } + uint64_t size() const { + return end_ - begin_; + } + +private: + uint64_t begin_; + uint64_t end_; +}; + +} // namespace ps +#endif // PS_RANGE_H_ diff --git a/ps-lite/include/ps/server/PSFHandle.h b/ps-lite/include/ps/server/PSFHandle.h new file mode 100644 index 0000000..95b823b --- /dev/null +++ b/ps-lite/include/ps/server/PSFHandle.h @@ -0,0 +1,404 @@ +#pragma once + +#include "ps/psf/PSFunc.h" + +#include "common/thread_safe_hash_map.h" +#include "param.h" +#include +#include +#include +#include +#include +#include + +namespace ps { +/** + * \brief used in ML part for sparse/dense pull, push. + * keys is used for the key of one partition. + * lens is used as the offset of the keys. + * vals is vals. + * One key (two keys for binary op) per request in Hetu. + * Is it ok in a lock-free manner? 
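The serializer shown a little earlier flattens a Request/Response tuple into a vector of byte buffers: every scalar field is packed back-to-back into slot 0 and every SArray field is appended as its own zero-copy slot. That layout, reduced to a fixed (key, values) pair with std::vector in place of SArray and no template recursion (Encoded/Encode/Decode are illustrative names):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Simplified picture of the wire layout built by tupleEncode/tupleDecode:
// slot 0 carries the packed scalars, later slots carry each array field.
struct Encoded {
    std::vector<char> scalars;              // slot 0
    std::vector<std::vector<float>> arrays; // slots 1..n
};

Encoded Encode(uint64_t key, const std::vector<float> &vals) {
    Encoded out;
    out.scalars.resize(sizeof(uint64_t));
    std::memcpy(out.scalars.data(), &key, sizeof(uint64_t));
    out.arrays.push_back(vals); // the real code appends the SArray without copying
    return out;
}

void Decode(const Encoded &in, uint64_t *key, std::vector<float> *vals) {
    std::memcpy(key, in.scalars.data(), sizeof(uint64_t));
    *vals = in.arrays[0];
}

int main() {
    uint64_t key = 0;
    std::vector<float> vals;
    Decode(Encode(42, {1.0f, 2.0f}), &key, &vals);
    assert(key == 42 && vals.size() == 2 && vals[1] == 2.0f);
    return 0;
}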
By @Zhipeng + */ + +class KVServerMatrixHandle { +public: + KVServerMatrixHandle() { + } + KVServerMatrixHandle(const KVServerMatrixHandle &handle) { + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + size_t len = get<1>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = *iter->second; + size_t data_size = value_set_.size(); + CHECK_EQ(len, data_size) << " size mismatch in DensePull " << k + << " " << len << " " << data_size; + pull_vals.resize(data_size); + auto read_lock = value_set_.read_guard(); + std::copy(value_set_.begin(), value_set_.end(), pull_vals.begin()); + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + size_t len = get<1>(request); + SArray vals = get<2>(request); + + if (const_store.find(k) == const_store.end()) { + store[k] = std::make_shared>(len, OptType::None, + SArray()); + } + auto iter = const_store.find(k); + if (iter != const_store.end()) { + CHECK_EQ(len, iter->second->size()) + << k << " " << len << " " << iter->second->size() + << " size mismatch in DensePush"; + // write, discard const qualifier + auto &value_set_ = + *const_cast(iter->second); + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < value_set_.size(); j++) + value_set_[j] += vals[j]; + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // one key per request. + // with response result + Key k = get<0>(request); + size_t len = get<1>(request); + SArray vals = get<2>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *const_cast(iter->second); + size_t data_size = value_set_.size(); + CHECK_EQ(len, data_size) + << " size mismatch in DDPushPull " << len << " " << data_size; + pull_vals.resize(data_size); + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < data_size; j++) { + value_set_[j] += vals[j]; + pull_vals[j] = value_set_[j]; + } + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // we use length as the offset, i.e., #length = #vals. + // with response result + Key k = get<0>(request); + SArray offset = get<1>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + pull_vals.resize(offset.size() * width); + auto read_lock = value_set_.read_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offset.size(); ++j) { + auto value_begin = value_set_.data() + offset[j] * width; + auto value_end = value_begin + width; + auto dst_begin = pull_vals.data() + j * width; + std::copy(value_begin, value_end, dst_begin); + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pulled key: " << k + << " does not exist on PS in SparsePull."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // we use length as the offset, i.e., #length = #vals. 
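The sparse handlers above address a parameter as a row-major (length x width) matrix: request entry j touches the width floats starting at offsets[j] * width. The gather performed by the SparsePull handler, reduced to a standalone function (GatherRows is an illustrative name):

#include <cassert>
#include <cstddef>
#include <vector>

// Row gather as in the SparsePull handler: for each requested row offset,
// copy one row of `width` floats out of the row-major table.
std::vector<float> GatherRows(const std::vector<float> &table, size_t width,
                              const std::vector<size_t> &offsets) {
    std::vector<float> out(offsets.size() * width);
    for (size_t j = 0; j < offsets.size(); ++j)
        for (size_t c = 0; c < width; ++c)
            out[j * width + c] = table[offsets[j] * width + c];
    return out;
}

int main() {
    // A 3-row, 2-column table; pull rows 2 and 0.
    std::vector<float> table = {0, 1, 10, 11, 20, 21};
    std::vector<float> pulled = GatherRows(table, 2, {2, 0});
    assert(pulled[0] == 20 && pulled[1] == 21 && pulled[2] == 0 && pulled[3] == 1);
    return 0;
}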
+ // no response result + Key k = get<0>(request); + SArray offsets = get<1>(request); + SArray vals = get<2>(request); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + + CHECK_EQ(vals.size(), offsets.size() * width) + << " in Psf::SparsePush check failed," + << " size of vals is " << vals.size() << " size of lens is " + << offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in SparsePush."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray offsets = get<1>(request); + SArray vals = get<2>(request); + size_t len = get<3>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + CHECK_EQ(len, value_set_.size()) + << " size mismatch in SDPushPull " << k << " " << len << " " + << value_set_.size(); + + // sparsepush phase + if (vals.size() > 0) { + CHECK_EQ(vals.size(), offsets.size() * width) + << " in Psf::SDPushPull check failed," + << " size of vals is " << vals.size() << " size of lens is " + << offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } + // densepull phase + pull_vals.resize(value_set_.size()); + auto read_lock = value_set_.read_guard(); + std::copy(value_set_.begin(), value_set_.end(), pull_vals.begin()); + } else { + // error, the key does not exist on PS. 
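The SparsePush handler above is the mirror operation to SparsePull: pushed rows are accumulated (+=) into the table at their offsets rather than overwriting them, so pushing the same row twice adds up. The scatter-add on its own (ScatterAddRows is an illustrative name):

#include <cassert>
#include <cstddef>
#include <vector>

// Row scatter-add as in the SparsePush handler: each pushed row is
// accumulated into the row-major table at its offset.
void ScatterAddRows(std::vector<float> &table, size_t width,
                    const std::vector<size_t> &offsets,
                    const std::vector<float> &vals) {
    assert(vals.size() == offsets.size() * width); // same check as the handler
    for (size_t j = 0; j < offsets.size(); ++j)
        for (size_t c = 0; c < width; ++c)
            table[offsets[j] * width + c] += vals[j * width + c];
}

int main() {
    std::vector<float> table(6, 0.0f);              // 3 rows x 2 cols, all zero
    ScatterAddRows(table, 2, {1, 1}, {1, 1, 2, 2}); // row 1 pushed twice
    assert(table[2] == 3 && table[3] == 3 && table[0] == 0);
    return 0;
}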
+ LF << "[Error] The pushed key: " << k + << " does not exist on PS in SDPushPull."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray push_offsets = get<1>(request); + SArray vals = get<2>(request); + SArray pull_offsets = get<3>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + + // sparsepush phase + if (vals.size() > 0) { + CHECK_EQ(vals.size(), push_offsets.size() * width) + << " in Psf::SSPushPull check failed," + << " size of vals is " << vals.size() << " size of lens is " + << push_offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < push_offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = push_offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } + + // sparsepull phase + if (pull_offsets.size() > 0) { + pull_vals.resize(pull_offsets.size() * width); + auto read_lock = value_set_.read_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < pull_offsets.size(); ++j) { + auto val_begin = + value_set_.begin() + pull_offsets[j] * width; + auto val_end = val_begin + width; + auto dst_begin = pull_vals.begin() + j * width; + std::copy(val_begin, val_end, dst_begin); + } + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in SparsePush."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response); + void serve(const PSFData::Request &request, + PSFData::Response &response); + void serve(const PSFData::Request &request, + PSFData::Response &response); + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // one key per request. 
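The ParamInit handler starting here fills a freshly created parameter according to its InitType; the TruncatedNormal branch resamples until the value falls within two standard deviations of the mean. That rejection loop, isolated into a runnable snippet:

#include <cassert>
#include <cmath>
#include <random>

// Truncated-normal sample in [mean - 2*stddev, mean + 2*stddev], drawn by
// rejection sampling as in the TruncatedNormal branch of ParamInit.
float TruncatedNormal(std::default_random_engine &gen, float mean, float stddev) {
    std::normal_distribution<float> dist(mean, stddev);
    const float lo = mean - 2 * stddev, hi = mean + 2 * stddev;
    float v = dist(gen);
    while (v < lo || v > hi)
        v = dist(gen);
    return v;
}

int main() {
    std::default_random_engine gen(1234);
    for (int i = 0; i < 1000; ++i)
        assert(std::fabs(TruncatedNormal(gen, 0.0f, 1.0f)) <= 2.0f);
    return 0;
}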
+ // no response result + Key k = get<0>(request); + ParamType param_type = (ParamType)get<1>(request); + size_t len = get<2>(request); + size_t width = get<3>(request); + InitType init_type = (InitType)get<4>(request); + double init_a = get<5>(request); + double init_b = get<6>(request); + unsigned long long seed = get<7>(request); + OptType otype = (OptType)get<8>(request); + SArray lrs = get<9>(request); + + Param *newParam = nullptr; + switch (param_type) { + case kParam: + newParam = new Param(len, otype, lrs); + break; + case kParam2D: + newParam = new Param2D(len, width, otype, lrs); + break; + case kCacheTable: + newParam = new CacheTable(len, width, otype, lrs); + } + auto iter = store.emplaceIfAbsent(k, newParam); + + CHECK_EQ(len * width, iter->second->size()) + << k << " " << len << " " << width << " " << iter->second->size() + << " size mismatch in UniformInit"; + // write, discard const qualifier + auto &value_set_ = + *const_cast(iter->second); + auto write_lock = value_set_.write_guard(); + if (init_type == InitType::Constant) { + float filled_value = static_cast(init_a); + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < value_set_.size(); j++) + value_set_[j] = filled_value; + } else if (init_type == InitType::Uniform) { + std::uniform_real_distribution uniform_dist(init_a, init_b); + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + value_set_[j] = uniform_dist(generator); + } + } else if (init_type == InitType::Normal) { + std::normal_distribution normal_dist(init_a, init_b); + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + value_set_[j] = normal_dist(generator); + } + } else if (init_type == InitType::TruncatedNormal) { + std::normal_distribution truncated_normal_dist(init_a, + init_b); + float upper_limit = init_a + 2 * init_b; + float lower_limit = init_a - 2 * init_b; + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + float temp = truncated_normal_dist(generator); + while (temp > upper_limit || temp < lower_limit) + temp = truncated_normal_dist(generator); + value_set_[j] = temp; + } + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto iter = store.find(k); + if (iter != store.end()) { + store.erase(iter); + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamClear."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray address = get<1>(request); + auto iter = store.find(k); + if (iter != store.end()) { + auto &value_set_ = *iter->second; + auto read_lock = value_set_.read_guard(); + std::ofstream fout( + std::string(address.data(), address.size()).c_str(), + std::ios::binary); + fout.write((char *)value_set_.data(), + value_set_.size() * sizeof(float)); + } else { + // error, the key does not exist on PS. 
+ LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamSave."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray address = get<1>(request); + auto iter = store.find(k); + if (iter != store.end()) { + auto &value_set_ = *iter->second; + auto write_lock = value_set_.write_guard(); + std::ifstream fin( + std::string(address.data(), address.size()).c_str(), + std::ios::binary); + fin.read((char *)value_set_.data(), + value_set_.size() * sizeof(float)); + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamLoad."; + } + } + +private: + typedef threadsafe_unordered_map>> tmap; + tmap store; + const tmap &const_store = + store; // const reference to force compiler to use read lock +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/kvserver.h b/ps-lite/include/ps/server/kvserver.h new file mode 100644 index 0000000..e2a8556 --- /dev/null +++ b/ps-lite/include/ps/server/kvserver.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ps/server/PSFHandle.h" +#include "ps/psf/serializer.h" +#include "ps/kvapp.h" +#include +#include +namespace ps { + +template +struct KVServerRegisterHelper; + +/** + * \brief A server node for maintaining key-value pairs + */ +class KVServer : public KVApp { +public: + /** + * \brief constructor + * \param app_id the app id, should match with \ref KVWorker's id + */ + explicit KVServer(int app_id) : KVApp(app_id) { + KVAppRegisterHelper::init(this); + } + +private: + template + void onReceive(const Message &msg) { + typename PSFData::Request request; + typename PSFData::Response response; + tupleDecode(request, msg.data); + handler.serve(request, response); + Message rmsg; + tupleEncode(response, rmsg.data); + rmsg.meta = msg.meta; + rmsg.meta.recver = msg.meta.sender; + rmsg.meta.request = false; + Postoffice::Get()->van()->Send(rmsg); + } + + /** \brief request handle */ + KVServerMatrixHandle handler; + template + friend struct KVAppRegisterHelper; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/optimizer.h b/ps-lite/include/ps/server/optimizer.h new file mode 100644 index 0000000..c433019 --- /dev/null +++ b/ps-lite/include/ps/server/optimizer.h @@ -0,0 +1,357 @@ +#pragma once + +#include +#include "ps/server/param.h" + +namespace ps { + +template +class Param; +template +class Param2D; +template +class CacheTable; + +enum OptType { + SGD, + Momentum, + NesterovMomentum, + AdaGrad, + Adam, + None, +}; + +template +class Optimizer { +public: + virtual void ApplyDense(Param ¶m, SArray &grads); + virtual void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads); + virtual void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads); + virtual void InitStates(size_t size); +}; + +template +class SGDOptimizer : public Optimizer { +public: + explicit SGDOptimizer(float learning_rate) : lr(learning_rate) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + param[j] -= lr * grads[j]; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + param[dst_offset + k] -= lr * grads[src_offset + k]; + } + } + } + + void 
ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + param[dst_offset + k] -= lr * grads[src_offset + k]; + } + } + } + + void InitStates(size_t size) { + } + +private: + float lr; +}; + +// Optimizers below need tests! No correctness guarantees. +template +class MomentumOptimizer : public Optimizer { +public: + explicit MomentumOptimizer(float learning_rate, float momentum) : + lr(learning_rate), moment(momentum) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + velocity[j] = moment * velocity[j] - lr * grads[j]; + param[j] = param[j] + velocity[j]; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + velocity[cur_dst] = + moment * velocity[cur_dst] - lr * grads[cur_src]; + param[cur_dst] = param[cur_dst] + velocity[cur_dst]; + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + velocity[cur_dst] = + moment * velocity[cur_dst] - lr * grads[cur_src]; + param[cur_dst] = param[cur_dst] + velocity[cur_dst]; + } + } + } + + void InitStates(size_t size) { + velocity = new V[size](); + } + +private: + float lr; + float moment; + V *velocity; +}; + +template +class NesterovMomentumOptimizer : public Optimizer { +public: + explicit NesterovMomentumOptimizer(float learning_rate, float momentum) : + lr(learning_rate), moment(momentum) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + V temp = -lr * grads[j]; + velocity[j] = moment * (velocity[j] + temp); + param[j] = param[j] + velocity[j] + temp; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + V temp = -lr * grads[cur_src]; + velocity[cur_dst] = moment * (velocity[cur_dst] + temp); + param[cur_dst] = param[cur_dst] + velocity[cur_dst] + temp; + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = 
src_offset + k; + size_t cur_dst = dst_offset + k; + V temp = -lr * grads[cur_src]; + velocity[cur_dst] = moment * (velocity[cur_dst] + temp); + param[cur_dst] = param[cur_dst] + velocity[cur_dst] + temp; + } + } + } + + void InitStates(size_t size) { + velocity = new V[size](); + } + +private: + float lr; + float moment; + V *velocity; +}; + +template +class AdaGradOptimizer : public Optimizer { +public: + explicit AdaGradOptimizer(float learning_rate, float initial, + float epsilon) : + lr(learning_rate), + init(initial), eps(epsilon) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + accum[j] = accum[j] + grads[j] * grads[j]; + param[j] = param[j] - lr * grads[j] / (sqrt(accum[j]) + eps); + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + accum[cur_dst] = + accum[cur_dst] + grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * grads[cur_src] / (sqrt(accum[cur_dst]) + eps); + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + accum[cur_dst] = + accum[cur_dst] + grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * grads[cur_src] / (sqrt(accum[cur_dst]) + eps); + } + } + } + + void InitStates(size_t size) { + accum = new V[size]; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < size; ++j) + accum[j] = init; + } + +private: + float lr; + float init; + float eps; + V *accum; +}; + +template +class AdamOptimizer : public Optimizer { +public: + explicit AdamOptimizer(float learning_rate, float beta1, float beta2, + float epsilon) : + lr(learning_rate), + b1(beta1), b2(beta2), eps(epsilon) { + b1t = 1.0; + b2t = 1.0; + } + + void ApplyDense(Param ¶m, SArray &grads) { + b1t = b1t * b1; + b2t = b2t * b2; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + marr[j] = b1 * marr[j] + (1 - b1) * grads[j]; + varr[j] = b2 * varr[j] + (1 - b2) * grads[j] * grads[j]; + param[j] = + param[j] + - lr * marr[j] / (1 - b1t) / (sqrt(varr[j] / (1 - b2t)) + eps); + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + marr[cur_dst] = b1 * marr[cur_dst] + (1 - b1) * grads[cur_src]; + varr[cur_dst] = b2 * varr[cur_dst] + + (1 - b2) * grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * marr[cur_dst] / (1 - b1t) + / (sqrt(varr[cur_dst] / (1 - b2t)) + eps); + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width 
= param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + marr[cur_dst] = b1 * marr[cur_dst] + (1 - b1) * grads[cur_src]; + varr[cur_dst] = b2 * varr[cur_dst] + + (1 - b2) * grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * marr[cur_dst] / (1 - b1t) + / (sqrt(varr[cur_dst] / (1 - b2t)) + eps); + } + } + } + + void InitStates(size_t size) { + marr = new V[size](); + varr = new V[size](); + } + +private: + float lr; + float b1; + float b2; + float eps; + float b1t; + float b2t; + V *marr; + V *varr; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/param.h b/ps-lite/include/ps/server/param.h new file mode 100644 index 0000000..5617cac --- /dev/null +++ b/ps-lite/include/ps/server/param.h @@ -0,0 +1,140 @@ +#pragma once + +#include + +#include "common/shared_mutex.h" +#include "ps/psf/PSFunc.h" +#include "ps/server/optimizer.h" + +namespace ps { + +enum ParamType { + kParam, + kParam2D, + kCacheTable, +}; + +/* + Param with a read-write lock +*/ +template +class Param { +public: + explicit Param(size_t size, OptType otype, SArray lrs) { + vec_ = new V[size](); + size_ = size; + switch (otype) { + case SGD: + opt = new SGDOptimizer(lrs[0]); + break; + case Momentum: + opt = new MomentumOptimizer(lrs[0], lrs[1]); + break; + case NesterovMomentum: + opt = new NesterovMomentumOptimizer(lrs[0], lrs[1]); + break; + case AdaGrad: + opt = new AdaGradOptimizer(lrs[0], lrs[1], lrs[2]); + break; + case Adam: + opt = new AdamOptimizer(lrs[0], lrs[1], lrs[2], lrs[3]); + break; + case None: + opt = nullptr; + return; + } + opt->InitStates(size); + } + + ~Param() { + delete[] vec_; + } + + Param(const Param &) = delete; + + s_lock<4> read_guard() const noexcept { + return s_lock<4>(mtx); + } + x_lock<4> write_guard() noexcept { + return x_lock<4>(mtx); + } + + inline const V *data() const { + return vec_; + } + inline V *data() { + return vec_; + } + inline V *begin() { + return data(); + } + inline V *end() { + return data() + size(); + } + inline V &operator[](size_t i) { + return vec_[i]; + } + inline const V &operator[](size_t i) const { + return vec_[i]; + } + inline size_t size() const { + return size_; + } + virtual ParamType type() { + return kParam; + } + void updateDense(SArray &grads) { + auto write_lock = write_guard(); + opt->ApplyDense(*this, grads); + } + +private: + mutable shared_mutex<4> mtx; + V *vec_; + size_t size_; + +protected: + Optimizer *opt; +}; + +template +class Param2D : public Param { +public: + explicit Param2D(size_t len, size_t wid, OptType otype, SArray lrs) : + Param(len * wid, otype, lrs) { + length = len; + width = wid; + } + void updateSparse(SArray &offsets, SArray &grads) { + auto write_lock = this->write_guard(); + this->opt->ApplySparse(*this, offsets, grads); + } + ParamType type() { + return kParam2D; + } + size_t length, width; +}; + +template +class CacheTable : public Param2D { +public: + explicit CacheTable(size_t len, size_t wid, OptType otype, + SArray lrs) : + Param2D(len, wid, otype, lrs) { + ver = new version_t[len](); + } + ~CacheTable() { + delete[] ver; + } + void updateCache(SArray &updates, SArray &offsets, + SArray &grads) { + auto write_lock = this->write_guard(); + this->opt->ApplyCache(*this, updates, offsets, grads); + } + ParamType 
type() { + return kCacheTable; + } + version_t *ver; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/PSAgent.h b/ps-lite/include/ps/worker/PSAgent.h new file mode 100644 index 0000000..edc3a9b --- /dev/null +++ b/ps-lite/include/ps/worker/PSAgent.h @@ -0,0 +1,609 @@ +#pragma once + +#include "ps/ps.h" +#include "ps/worker/kvworker.h" +#include "ps/psf/PSFunc.h" +#include "ps/server/param.h" +#include "common/logging.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ps { + +struct TensorMeta { + ParamType ptype; + size_t length; + size_t width = 1; + /* split a tensor into multiple pieces. [node_name --> + * splitted_dl_array_keys] */ + vector keys; + /* [node_name --> timestamp to be waited] */ + std::vector ts; + std::vector part; +}; + +struct SparseInfos { + // store structures used in sparse operations to avoid memory leak + // if using C++17 can changed to shared_ptr (which supports dynamic arrays) + size_t *in_offset; + size_t *out_offset; + float *in_data; +}; + +/* + * A singleton object for pulling or push to PS. + * Since we enable sparse pull/push in PSVector and the length of each val is + * one, thus the $lens in @kvpairs is not useful. As a result, we use $lens to + * store the offset of each vector. for example, key=1000, lens = {1,2,3}, then + * we are accessing elements with ids as {1000+1, 1000+2, 1000+3} + */ +class PSAgent { +private: + /* The KVWorker used to make requests. */ + KVWorker _kvworker; + Partitioner *_par; + std::unordered_map _id2meta; + std::unordered_map _id2sparseinfo; + + Key _globalId = 0; + + /* for round-robin tensor placement */ + size_t _serverIndex = 0; + + PSAgent() : _kvworker(0, 0) { + _par = _kvworker.par; + } + +public: + static PSAgent *Get() { + static PSAgent e; + return &e; + } + + void wait(int name) { + for (int t : _id2meta[name].ts) + _kvworker.Wait(t); + _id2meta[name].ts.clear(); + } + + void clear(int name) { + _id2meta.erase(name); + // TODO: delete on PS + } + + void clearOnServer(int name) { + TensorMeta &meta = _id2meta[name]; + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request(meta.keys[i]); + auto cb = getCallBack(); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + wait(name); + } + + void waitTimestamp(int timestamp) { + _kvworker.Wait(timestamp); + } + + /** + * \brief init the meta information about this data on PS. + * the meta data is stored on each worker. + * \param name the name of the input data + * \param cols the #columns of the data, the data are partitioned by cols. 
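+     * \param ptype kParam for dense data, kParam2D / kCacheTable for 2-D tables
+     * \param length the length (dense) or number of rows (2-D) of the data
+     * \param width the row width for 2-D data, 1 for dense data
+     * e.g. (illustrative values): registerTensor(0, kCacheTable, 100000, 128)
+     * registers a 100000 x 128 embedding table on the servers.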
+ */ + void registerTensor(const int name, const ParamType ptype, + const size_t length, const size_t width = 1) { + assert(!_id2meta.count(name)); + TensorMeta tm; + tm.ptype = ptype; + tm.length = length; + if (ptype == kParam) { + _par->partitionDense(length, tm.keys, tm.part); + } else { + tm.width = width; + _par->partitionSparse(length, width, tm.keys, tm.part); + SparseInfos sp; + sp.in_offset = nullptr; + sp.out_offset = nullptr; + sp.in_data = nullptr; + _id2sparseinfo[name] = sp; + } + _id2meta[name] = tm; + } + + void vecPushSparse(const int name, float *dup_index, float *vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.in_data; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t num_all = index_size * width; + size_t *cp_offset = sp.in_offset = new size_t[index_size]; + float *cp_val = sp.in_data = new float[num_all](); + + size_t cur_index = 0; + size_t cur_offset = 0; + size_t cur_len = 0; + auto iter = idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + size_t st_offset = cur_offset; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + for (auto j : iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + cp_val[cur_offset + k] += vals[ori_offset + k]; + } + } + cur_offset += width; + ++iter; + } + if (cur_index > st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index), + SArray(cp_val + st_offset, cur_offset - st_offset)); + auto cb = getCallBack(); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if (t.first) + meta.ts.push_back(t.second); + return; + } + + void vecPullSparse(const int name, float *dup_index, float *vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.out_offset; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t *cp_offset = sp.out_offset = new size_t[index_size]; + + size_t cur_index = 0; + size_t cur_len = 0; + auto iter = idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + auto st_iter = iter; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + ++iter; + } + if (cur_index > st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index)); + auto cb = getCallBack( + SArray(vals, dup_index_size * width), + std::move( + std::vector>>( + st_iter, iter)), + cur_len, width); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if 
(t.first) + meta.ts.push_back(t.second); + return; + } + + void vecSDPushPull(const int name, float *dup_index, float *vals, + const size_t dup_index_size, float *out_vals, + int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.in_data; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t num_all = index_size * width; + size_t *cp_offset = sp.in_offset = new size_t[index_size]; + float *cp_val = sp.in_data = new float[num_all](); + + size_t cur_index = 0; + size_t cur_offset = 0; + size_t cur_len = 0; + size_t pull_offset = 0; + auto iter = idx2map.begin(); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + size_t st_offset = cur_offset; + size_t local_length = lens[i] * width; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + for (auto j : iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + cp_val[cur_offset + k] += vals[ori_offset + k]; + } + } + cur_offset += width; + ++iter; + } + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index), + SArray(cp_val + st_offset, cur_offset - st_offset), + local_length); + auto cb = getCallBack( + SArray(out_vals + pull_offset, local_length)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += lens[i]; + pull_offset += local_length; + } + return; + } + + void vecSSPushPull(const int name, float *in_index, float *in_vals, + float *out_index, float *out_vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.out_offset; + delete[] sp.in_data; + + std::map> in_idx2map; + std::map> out_idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)in_index[i]; + in_idx2map[idx].emplace_back(i); + idx = (size_t)out_index[i]; + out_idx2map[idx].emplace_back(i); + } + + size_t in_index_size = in_idx2map.size(); + size_t out_index_size = out_idx2map.size(); + size_t in_num_all = in_index_size * width; + size_t *in_cp_offset = sp.in_offset = new size_t[in_index_size]; + size_t *out_cp_offset = sp.out_offset = new size_t[out_index_size]; + float *in_cp_val = sp.in_data = new float[in_num_all](); + + size_t in_cur_index = 0; + size_t in_cur_offset = 0; + size_t cur_len = 0; + size_t out_cur_index = 0; + auto in_iter = in_idx2map.begin(); + auto out_iter = out_idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t in_st_index = in_cur_index; + size_t st_offset = in_cur_offset; + while (in_iter != in_idx2map.end() + && in_iter->first < cur_len + lens[i]) { + in_cp_offset[in_cur_index++] = in_iter->first - cur_len; + for (auto j : in_iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + in_cp_val[in_cur_offset + k] += in_vals[ori_offset + k]; + } + } + in_cur_offset += width; + ++in_iter; + } + + size_t out_st_index = out_cur_index; + auto st_iter = out_iter; + while (out_iter != out_idx2map.end() + && out_iter->first < cur_len + 
lens[i]) { + out_cp_offset[out_cur_index++] = out_iter->first - cur_len; + ++out_iter; + } + + if (in_cur_index > in_st_index || out_cur_index > out_st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(in_cp_offset + in_st_index, + in_cur_index - in_st_index), + SArray(in_cp_val + st_offset, + in_cur_offset - st_offset), + SArray(out_cp_offset + out_st_index, + out_cur_index - out_st_index)); + auto cb = getCallBack( + SArray(out_vals, dup_index_size * width), + std::move( + std::vector>>( + st_iter, out_iter)), + cur_len, width); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if (t.first) + meta.ts.push_back(t.second); + return; + } + + /** + * \brief PSVector: pull pairs from PS. + * \param name name of the PSVector + * \param vals the vals of pullsh vals + */ + void vecDensePush(const int name, float *vals, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + auto cb = getCallBack(); + /* send push request to each partition according to the offsets. */ + size_t cur_len = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.part[i], + SArray(vals + cur_len, meta.part[i])); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void vecDensePull(const int name, float *vals, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + size_t cur_offset = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + size_t cur_length = meta.part[i] * meta.width; + PSFData::Request request(meta.keys[i], cur_length); + auto cb = getCallBack( + SArray(vals + cur_offset, cur_length)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_offset += cur_length; + } + } + + void vecDDPushPull(const int name, float *in_vals, float *out_vals, + int priority = 0) { + TensorMeta &meta = _id2meta[name]; + size_t cur_len = 0; + /* send pull request to each partition */ + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.part[i], + SArray(in_vals + cur_len, meta.part[i])); + auto cb = getCallBack( + SArray(out_vals + cur_len, meta.part[i])); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void ParameterInit(const int name, InitType init_type, double init_a, + double init_b, unsigned long long seed, OptType otype, + SArray lrs) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.ptype, meta.part[i], meta.width, init_type, + init_a, init_b, seed, otype, lrs); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void ParameterSave(const int name, char *address) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + std::string local_address = std::string(address) + "/" + + std::to_string(name) + "_" + + std::to_string(i) + ".dat"; + SArray temp_array; + temp_array.CopyFrom(local_address.c_str(), local_address.size()); + PSFData::Request request(meta.keys[i], temp_array, + false); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void ParameterLoad(const int name, char *address) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); 
i++) { + std::string local_address = std::string(address) + "/" + + std::to_string(name) + "_" + + std::to_string(i) + ".dat"; + SArray temp_array; + temp_array.CopyFrom(local_address.c_str(), local_address.size()); + PSFData::Request request(meta.keys[i], temp_array); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void startRecord(std::string dirPath) { + _kvworker.startRecord(dirPath); + } + + void getLoads() { + _kvworker.recordLoads(); + } + + /* + A simple key mapping for multiple server case + */ + Key mapWkeyToSkey(Key idx) { + const std::vector &server_range = + Postoffice::Get()->GetServerKeyRanges(); + int server = idx % server_range.size(); + Key k = server_range[server].end() - idx - 1; + return k; + } + + /* + Enqueue the Zpush request for PushData + */ + void PushData(Key idx, float *vals, int len, std::vector ×tamp) { + auto cb = getCallBack(); + PSFData::Request request(mapWkeyToSkey(idx), len, + SArray(vals, len)); + int ts = _kvworker.Request(request, cb); + timestamp.push_back(ts); + } + + // This is almost the same as PushData + void PullData(Key idx, float *vals, int len, std::vector ×tamp) { + auto cb = getCallBack(SArray(vals, len)); + PSFData::Request request(mapWkeyToSkey(idx), len); + int ts = _kvworker.Request(request, cb); + timestamp.push_back(ts); + } + + void syncEmbedding(int name, const SArray &rows, + const SArray &ver, version_t bound, + PSFData::Closure closure) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + if (start == end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, ver.segment(start, end), bound); + auto cb = std::bind(closure, std::placeholders::_1, start); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void pushEmbedding(int name, const SArray &rows, + const SArray &data, + const SArray &updates) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0; + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + if (start == end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, + data.segment(start * meta.width, end * meta.width), + updates.segment(start, end)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void pushSyncEmbedding(int name, const SArray &rows, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_rows, + const SArray &data, + const SArray &updates) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0, push_start = 0, push_end = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + push_start = push_end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + push_end = 
std::lower_bound(push_rows.begin() + push_start, + push_rows.end(), cur_len + meta.part[i]) + - push_rows.begin(); + if (start == end && push_start == push_end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end), + new_push_rows = + push_rows.segment(push_start, push_end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + for (size_t i = 0; i < new_push_rows.size(); i++) + new_push_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, ver.segment(start, end), bound, + new_push_rows, + data.segment(push_start * meta.width, push_end * meta.width), + updates.segment(push_start, push_end)); + auto cb = std::bind(closure, std::placeholders::_1, start); + meta.ts.push_back( + _kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/callback_store.h b/ps-lite/include/ps/worker/callback_store.h new file mode 100644 index 0000000..c8fec8c --- /dev/null +++ b/ps-lite/include/ps/worker/callback_store.h @@ -0,0 +1,44 @@ +#pragma once + +#include "ps/psf/PSFunc.h" + +#include "unordered_map" +#include "mutex" + +namespace ps { + +// Used to lookup the callback for different ps functions +// Store a callback use store(timestamp, cb) +// Run a callback use run(timestamp, response) +template +class CallbackStore { +public: + using CallBack = function::Response &)>; + static CallbackStore *Get() { + static CallbackStore a; + return &a; + } + void run(int timestamp, const typename PSFData::Response &response) { + mu_.lock(); + auto it = store_.find(timestamp); + if (it != store_.end()) { + mu_.unlock(); + CHECK(it->second); + it->second(response); + mu_.lock(); + store_.erase(it); + } + mu_.unlock(); + } + void store(int ts, const CallBack &cb) { + mu_.lock(); + store_[ts] = cb; + mu_.unlock(); + } + +private: + std::unordered_map store_; + std::mutex mu_; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/hetu_binding.h b/ps-lite/include/ps/worker/hetu_binding.h new file mode 100644 index 0000000..0fd48c1 --- /dev/null +++ b/ps-lite/include/ps/worker/hetu_binding.h @@ -0,0 +1,30 @@ +#pragma once + +// Do not include worker.h or any ps-lite header here +// or we will have multiple PSAgent, PostOffice instance + +#include +#include "ps/psf/PSFunc.h" +using std::vector; + +namespace ps { + +void debug(); + +void syncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure); + +// Push Grads and Updates +// keys are unique +void PushEmbedding(int node_id, const SArray &keys, + const SArray &data, const SArray &updates); + +void PushSyncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_keys, + const SArray &data, + const SArray &updates); + +} // namespace ps diff --git a/ps-lite/include/ps/worker/kvworker.h b/ps-lite/include/ps/worker/kvworker.h new file mode 100644 index 0000000..0d7db47 --- /dev/null +++ b/ps-lite/include/ps/worker/kvworker.h @@ -0,0 +1,127 @@ +#pragma once + +#include "ps/psf/PSFunc.h" +#include "ps/psf/serializer.h" +#include "callback_store.h" +#include "ps/kvapp.h" +#include "ps/partitioner.h" +#include +#include +#include +#include +#include + +namespace ps { + +template +struct KVWorkerRegisterHelper; + +class KVWorker : private KVApp { +public: + Partitioner *par; + std::string PsfTypeString[15] = { + "DensePush", "DensePull", "DDPushPull", + "SparsePush", 
"SparsePull", "SDPushPull", + "SSPushPull", "ParamInit", "ParamClear", + "ParamSave", "ParamLoad", "kSyncEmbedding", + "kPushEmbedding", "kPushSyncEmbedding", "kNumPSfunction"}; + /** + * \brief constructor + * + * \param app_id the app id, should match with \ref KVServer's id + * \param customer_id the customer id which is unique locally + */ + explicit KVWorker(int app_id, int customer_id) : KVApp(app_id) { + KVAppRegisterHelper::init(this); + par = new AveragePartitioner(); // now use naive partitioner + } + + ~KVWorker() { + delete par; + if (logOut.is_open()) + logOut.close(); + } + + void startRecord(std::string dirPath) { + logOut.open(dirPath + "/loads_" + std::to_string(MyRank()) + ".txt"); + assert(logOut.is_open()); + } + + void recordLoads() { + for (auto iter = loads.begin(); iter != loads.end(); ++iter) { + logOut << PsfTypeString[iter->first] << ": " << (iter->second).first + << ' ' << (iter->second).second << std::endl; + } + logOut << std::endl; + loads.clear(); + } + + /** + * \brief Waits until a Request has been finished + * + * Sample usage: + * \code + * _kvworker.Wait(ts); + * \endcode + * + * \param timestamp the timestamp returned by kvworker.Request + */ + void Wait(int timestamp) { + obj_->WaitRequest(timestamp); + } + /** + * \brief make a new Request + * + * Sample usage: + * \code + * int ts = _kvworker.Request(request, callback); + * \endcode + * + * \param request create request by PSFData::Request + * \param cb the callback returned by getCallback(args...) + */ + template + int Request(const Tuple &request, const CallBack &cb) { + int timestamp = obj_->NewRequest(kServerGroup); + CallbackStore::Get()->store(timestamp, cb); + // Find the server + Key key = get<0>(request); + int target_server_id = par->queryServer(key); + // Create message + Message msg; + tupleEncode(request, msg.data); + if (logOut.is_open()) { + for (auto x : msg.data) { + loads[int(ftype)].first += x.size(); + } + } + msg.meta.app_id = obj_->app_id(); + msg.meta.customer_id = obj_->customer_id(); + msg.meta.timestamp = timestamp; + msg.meta.recver = Postoffice::Get()->ServerRankToID(target_server_id); + msg.meta.psftype = ftype; + msg.meta.request = true; + Postoffice::Get()->van()->Send(msg); + return timestamp; + } + +private: + template + void onReceive(const Message &msg) { + typename PSFData::Response response; + if (logOut.is_open()) { + for (auto x : msg.data) { + loads[int(ftype)].second += x.size(); + } + } + tupleDecode(response, msg.data); + int timestamp = msg.meta.timestamp; + CallbackStore::Get()->run(timestamp, response); + } + template + friend struct KVAppRegisterHelper; + std::unordered_map> loads; + std::ofstream logOut; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/worker.h b/ps-lite/include/ps/worker/worker.h new file mode 100644 index 0000000..550769e --- /dev/null +++ b/ps-lite/include/ps/worker/worker.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ps/ps.h" +#include "common/dlarray.h" +#include "common/c_runtime_api.h" +#include "ps/worker/PSAgent.h" +#include "ps/server/param.h" +#include "ps/server/optimizer.h" + +using namespace ps; + +class Worker { +public: + Worker(); + + void parameter_init(int node_name, ParamType ptype, size_t len, + size_t width, InitType init_type, double init_a, + double init_b, unsigned long long seed, OptType otype, + SArray lrs); + void parameter_save(int node_name, char *address); + void parameter_load(int node_name, char *address); + // for data 
push&pull + typedef uint64_t query_t; + /* + for each indice, call PSAgent::PushData to launch a thread + hold the return handle in the global map + immediately return + user should guaruntee value unchanged until waitdata + returns: + an query_t which is a long + use waitdata(query_t) to wait for its success + */ + query_t push_data(const long *indices, int index_size, const DLArray *value, + const long *lengths); + // this is almost the same as push_data + query_t pull_data(const long *indices, int index_size, DLArray *value, + const long *lengths); + /* + wait_data waits until a query success + */ + void wait_data(query_t query); + + void pull(int node_name, DLArray *arr); + void push(int node_name, const DLArray *arr, DLEvent *evt); + void dd_pushpull(int node_name, const DLArray *in_arr, DLArray *out_arr, + DLEvent *evt); + void sparse_pull(int node_name, const DLArray *index, DLArray *value, + size_t index_size); + void sparse_push(int node_name, const DLArray *index, const DLArray *value, + size_t index_size, DLEvent *evt); + void sd_pushpull(int node_name, const DLArray *index, const DLArray *in_arr, + size_t index_size, DLArray *out_arr, DLEvent *evt); + void ss_pushpull(int node_name, const DLArray *inind, const DLArray *in_arr, + const DLArray *outind, DLArray *out_arr, size_t index_size, + DLEvent *evt); + void wait(int node_name); + void clear(int node_name); + void clear_on_server(int node_name); + +private: + // used this hold to thread_pool return object + std::unordered_map> query2timestamp; + // data_pull & data_push query, increase 1 each call + query_t next_query = 0; + // protect query2timestamp and next_query + std::mutex data_mu; + + // for concurrent parameter push&pull + std::unordered_map> node2pullthread; + std::unordered_map> node2pushthread; + + int _thread_num = 3; +}; + +extern Worker worker; diff --git a/ps-lite/proto/meta.proto b/ps-lite/proto/meta.proto new file mode 100644 index 0000000..49f54de --- /dev/null +++ b/ps-lite/proto/meta.proto @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2015 by Contributors + */ +syntax = "proto2"; +package ps; +option optimize_for = LITE_RUNTIME; + +message PBNode { + // the node role + required int32 role = 1; + // node id + optional int32 id = 2; + // hostname or ip + optional string hostname = 3; + // the port this node is binding + optional int32 port = 4; + // whether this node is created by failover + optional bool is_recovery = 5; + // the locally unique id of an customer + optional int32 customer_id = 10; +} + +// system control info +message PBControl { + required int32 cmd = 1; + repeated PBNode node = 2; + optional int32 barrier_group = 3; + optional uint64 msg_sig = 4; +} + +// mete information about a message +message PBMeta { + // if set, then it is system control task. 
otherwise, it is for app + optional PBControl control = 1; + // true: a request task + // false: the response task to the request task with the same *time* + optional bool request = 2 [default = false]; + // the unique id of an application + optional int32 app_id = 3; + // the locally unique id of an customer + optional int32 customer_id = 4; + // the timestamp of this message + optional int32 timestamp = 5; + // priority + optional int32 priority = 6 [default = 0]; + // psftype + required int32 psftype = 7 [default = 0]; +} diff --git a/ps-lite/src/PSFhandle_embedding.cc b/ps-lite/src/PSFhandle_embedding.cc new file mode 100644 index 0000000..a35e7bc --- /dev/null +++ b/ps-lite/src/PSFhandle_embedding.cc @@ -0,0 +1,81 @@ +#include "ps/server/PSFHandle.h" + +namespace ps { + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto rows = get<1>(request); + auto data = get<2>(request); + auto updates = get<3>(request); + auto iter = const_store.find(k); + CHECK_NE(iter, const_store.end()) << "key does not exist"; + CHECK_EQ(iter->second->type(), kCacheTable) << " key is not Cachetable"; + auto &value_set = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set.width; + CHECK_EQ(updates.size(), rows.size()) + << "PushEmbedding updates size mismatch"; + CHECK_EQ(data.size(), rows.size() * width) + << "PushEmbedding data size mismatch"; + auto write_lock = value_set.write_guard(); + for (size_t i = 0; i < rows.size(); i++) { + value_set.ver[rows[i]] += updates[i]; + for (size_t j = 0; j < width; j++) + value_set[rows[i] * width + j] += data[i * width + j]; + } +} + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto rows = get<1>(request); + auto ver = get<2>(request); + auto bound = get<3>(request); + auto &idx = get<0>(response); + auto &ret_ver = get<1>(response); + auto &data = get<2>(response); + auto iter = const_store.find(k); + CHECK_NE(iter, const_store.end()) << "key does not exist"; + CHECK_EQ(iter->second->type(), kCacheTable) << " key is not Cachetable"; + auto &value_set = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set.width; + auto read_lock = value_set.read_guard(); + size_t count = 0; + for (size_t i = 0; i < rows.size(); i++) + if (ver[i] == -1 || value_set.ver[rows[i]] - ver[i] > bound) + count++; + idx.resize(count); + ret_ver.resize(count); + data.resize(count * width); + count = 0; + for (size_t i = 0; i < rows.size(); i++) { + if (ver[i] == -1 || value_set.ver[rows[i]] - ver[i] > bound) { + idx[count] = i; + ret_ver[count] = value_set.ver[rows[i]]; + std::copy(&value_set[rows[i] * width], + &value_set[(rows[i] + 1) * width], &data[count * width]); + count++; + } + } +} + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + PSFData::Request push_req( + std::get<0>(request), std::get<4>(request), std::get<5>(request), + std::get<6>(request)); + PSFData::Response push_res; + serve(push_req, push_res); + + PSFData::Request sync_req( + std::get<0>(request), std::get<1>(request), std::get<2>(request), + std::get<3>(request)); + serve(sync_req, response); +} + +} // namespace ps diff --git a/ps-lite/src/customer.cc b/ps-lite/src/customer.cc new file mode 100644 index 0000000..6922ecb --- /dev/null +++ b/ps-lite/src/customer.cc @@ -0,0 +1,67 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#include 
"ps/internal/customer.h" +#include "ps/internal/postoffice.h" +namespace ps { + +const int Node::kEmpty = std::numeric_limits::max(); +const int Meta::kEmpty = std::numeric_limits::max(); + +Customer::Customer(int app_id, int customer_id, + const Customer::RecvHandle &recv_handle) : + app_id_(app_id), + customer_id_(customer_id), recv_handle_(recv_handle) { + cur_timestamp = 0; + Postoffice::Get()->AddCustomer(this); + // for debug set num_threads = 5; + int num_threads = 5; + for (int i = 0; i < num_threads; i++) { + recv_threads_.emplace_back(new std::thread(&Customer::Receiving, this)); + } +} + +Customer::~Customer() { + Postoffice::Get()->RemoveCustomer(this); + Message msg; + msg.meta.control.cmd = Control::TERMINATE; + msg.meta.control.barrier_group = 0; + msg.meta.control.msg_sig = 0; + recv_queue_.Push(msg); + for (auto &thread : recv_threads_) + thread->join(); +} + +int Customer::NewRequest(int recver) { + std::lock_guard lk(tracker_mu_); + assert(recver == kServerGroup); + tracker_[cur_timestamp] = false; + return cur_timestamp++; +} + +void Customer::WaitRequest(int timestamp) { + std::unique_lock lk(tracker_mu_); + tracker_cond_.wait(lk, [this, timestamp] { return tracker_[timestamp]; }); + tracker_.erase(timestamp); +} + +void Customer::Receiving() { + while (true) { + Message recv; + // thread safe + recv_queue_.WaitAndPop(&recv); + if (!recv.meta.control.empty() + && recv.meta.control.cmd == Control::TERMINATE) { + recv_queue_.Push(recv); + break; + } + recv_handle_(recv); + if (!recv.meta.request) { + std::lock_guard lk(tracker_mu_); + tracker_[recv.meta.timestamp] = true; + tracker_cond_.notify_all(); + } + } +} + +} // namespace ps diff --git a/ps-lite/src/hetu_binding.cc b/ps-lite/src/hetu_binding.cc new file mode 100644 index 0000000..cd95031 --- /dev/null +++ b/ps-lite/src/hetu_binding.cc @@ -0,0 +1,35 @@ +#include "ps/worker/worker.h" +#include "ps/worker/hetu_binding.h" + +namespace ps { + +void syncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure) { + PSAgent::Get()->syncEmbedding(node_id, keys, ver, bound, closure); + PSAgent::Get()->wait(node_id); +} + +void PushEmbedding(int node_id, const SArray &keys, + const SArray &data, + const SArray &updates) { + PSAgent::Get()->pushEmbedding(node_id, keys, data, updates); + PSAgent::Get()->wait(node_id); +} + +void PushSyncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_keys, + const SArray &data, + const SArray &updates) { + PSAgent::Get()->pushSyncEmbedding(node_id, keys, ver, bound, closure, + push_keys, data, updates); + PSAgent::Get()->wait(node_id); +} + +void debug() { + printf("hetu at %p\n", Postoffice::Get()); +} + +} // namespace ps diff --git a/ps-lite/src/ibverbs_van.h b/ps-lite/src/ibverbs_van.h new file mode 100644 index 0000000..e1f48b3 --- /dev/null +++ b/ps-lite/src/ibverbs_van.h @@ -0,0 +1,1195 @@ +#ifndef PS_IBVERBS_VAN_H_ +#define PS_IBVERBS_VAN_H_ + +#ifdef DMLC_USE_IBVERBS + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ps/internal/threadsafe_queue.h" +#include "ps/internal/van.h" + +namespace ps { + +// Number of context buffers for sending START messages +static const int kStartDepth = 128; + +// Number of context buffers for writing messages +static const int kWriteDepth = kStartDepth; + 
+// Number of context buffers for receiving messages +static const int kRxDepth = kStartDepth * 2; + +// Number of context buffers for sending REPLY messages +static const int kReplyDepth = kRxDepth; + +// Maximum number of scatter/gather elements in any Work Request +static const int kSGEntry = 4; + +// Time to wait for resolution to complete (in milliseconds) +static const int kTimeoutms = 1000; + +// Number of backlog of incoming connection requests +static const int kRdmaListenBacklog = 128; + +// Number of preallocated work request buffers +static const int kMaxConcurrentWorkRequest = + kRxDepth + kStartDepth + kReplyDepth + kWriteDepth; + +// Length of buffers for storing hostname in the context of a connection request +static const int kMaxHostnameLength = 16; + +// Maximum number of ``data'' in a Message +// TODO(changlan): What if there are more data in Message? +static const int kMaxDataFields = 4; + +// Alignment in Mempool +static const size_t kAlignment = 8; + +template +static inline T align_floor(T v, T align) { + return v - (v % align); +} + +template +static inline T align_ceil(T v, T align) { + return align_floor(v + align - 1, align); +} + +// A simple thread-safe memory pool for RDMA memory regions +class SimpleMempool { +public: + // Allocated an initial ``size'' of registered memory regions + explicit SimpleMempool(struct ibv_pd *pd, size_t size = 0x1000000) { + pd_ = pd; + struct ibv_mr *mr; + char *p = reinterpret_cast(aligned_alloc(kAlignment, size)); + total_allocated_size += size; + CHECK(p); + CHECK(mr = + ibv_reg_mr(pd, p, size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE)); + // this mr is associated with memory address range [p, p+size] + mr_list.emplace(p + size, mr); + free_list.emplace(size, p); + } + + // Deregister and release all memory regions + ~SimpleMempool() { + std::lock_guard lk(mu_); + for (auto it = mr_list.begin(); it != mr_list.end(); it++) { + CHECK_EQ(ibv_dereg_mr(it->second), 0); + free(it->second->addr); + } + } + + // Take a buffer of ``size'' from the pool. If there is not enough remaining + // space in existing memory regions, allocate and register a new memory + // region. 
+ char *Alloc(size_t size) { + if (size == 0) { + return nullptr; + } + + std::lock_guard lk(mu_); + + // Make sure the memory addresses are aligned by rounding the size up to + // next power of two + size_t proper_size = align_ceil(size, kAlignment); + + // Find a buffer of size greater than or equal to proper_size + auto it = free_list.lower_bound(proper_size); + + if (it + == free_list.end()) { // if there is no space left, need to allocate + // and register new memory + size_t new_mem_size = total_allocated_size; + while (proper_size > new_mem_size) { + new_mem_size *= 2; + } + char *p = reinterpret_cast( + aligned_alloc(kAlignment, new_mem_size)); + CHECK(p); + struct ibv_mr *mr; + CHECK(mr = ibv_reg_mr(pd_, p, new_mem_size, + IBV_ACCESS_LOCAL_WRITE + | IBV_ACCESS_REMOTE_WRITE)); + mr_list.emplace(p + new_mem_size, mr); + free_list.emplace(new_mem_size, p); + it = free_list.lower_bound(proper_size); + PS_VLOG(1) << "Not enough memory in the pool, requested size " + << proper_size << ", new allocated size " + << new_mem_size; + total_allocated_size += new_mem_size; + } + + CHECK_NE(free_list.end(), it) << "Not enough memory"; + CHECK_GE(it->first, proper_size); + + char *addr = it->second; + size_t space_left = it->first - proper_size; + + free_list.erase(it); + CHECK_EQ(used_list.find(addr), used_list.end()) + << "Address is already allocated"; + + used_list.emplace(addr, proper_size); + + if (space_left) { + free_list.emplace(space_left, addr + proper_size); + } + + return addr; + } + + // Return the buffer pointed by ``addr'' into the pool + void Free(char *addr) { + if (!addr) { + return; + } + + std::lock_guard lk(mu_); + + auto it = used_list.find(addr); + CHECK_NE(used_list.end(), it) + << "Cannot find info about address: " << (uintptr_t)addr; + + size_t size = it->second; + used_list.erase(it); + free_list.emplace(size, addr); + } + + uint32_t LocalKey(char *addr) { + struct ibv_mr *mr = Addr2MR(addr); + return mr->lkey; + } + + uint32_t RemoteKey(char *addr) { + struct ibv_mr *mr = Addr2MR(addr); + return mr->rkey; + } + +private: + std::mutex mu_; // for thread safety + struct ibv_pd *pd_; + + // buffer size -> buffer pointer + std::multimap free_list; + // buffer pointer -> buffer size + std::unordered_map used_list; + // first: `end` of this mr address (e.g., for mr with [addr, addr+size), + // point to `addr+size`) + std::map mr_list; + + size_t total_allocated_size = 0; + + // Convert the memory address to its associated RDMA memory region + inline struct ibv_mr *Addr2MR(char *addr) { + auto it = mr_list.lower_bound(addr); + CHECK_NE(it, mr_list.end()) + << "cannot find the associated memory region"; + return it->second; + } +}; + +class Block { +public: + explicit Block(SimpleMempool *pool, char *addr, int count) : + pool(pool), addr(addr), counter(count) { + } + + ~Block() { + CHECK_EQ(counter, 0); + pool->Free(addr); + } + + void Release() { + int v = counter.fetch_sub(1); + if (v == 1) { + delete this; + } + } + +private: + SimpleMempool *pool; + char *addr; + std::atomic counter; +}; + +enum MessageTypes : uint32_t { + kRendezvousStart, + kRendezvousReply, +}; + +struct RendezvousStart { + uint64_t meta_len; + uint64_t data_num; + uint64_t data_len[kMaxDataFields]; + uint64_t origin_addr; +}; + +struct RendezvousReply { + uint64_t addr; + uint64_t origin_addr; + uint32_t rkey; + uint32_t idx; +}; + +enum WRContextType { + kRendezvousStartContext, + kRendezvousReplyContext, + kWriteContext, + kReceiveContext +}; + +struct WRContext { + WRContextType type; + struct 
ibv_mr *buffer; + void *private_data; +}; + +struct BufferContext { + char *buffer; + size_t meta_len; + size_t data_num; + size_t data_len[kMaxDataFields]; +}; + +struct LocalBufferContext { + size_t meta_len; + char *meta_buf; + std::vector> data; +}; + +struct MessageBuffer { + size_t inline_len; + char *inline_buf; + WRContext *reserved_context; + std::vector> data; + std::vector> mrs; +}; + +struct RequestContext { + uint32_t node; + uint16_t port; + char hostname[kMaxHostnameLength]; +}; + +static_assert(std::is_pod::value, + "RendezvousStart must be a POD type."); +static_assert(std::is_pod::value, + "RendezvousReply must be a POD type."); +static_assert(std::is_pod::value, + "RequestContext must be a POD type."); + +static const size_t kMempoolChunkSize = + std::max(sizeof(RendezvousStart), sizeof(RendezvousReply)); + +template +class AddressPool { +public: + AddressPool() { + std::lock_guard lk(mu_); + for (int i = 0; i < kMaxEntries; i++) { + indices_.push(i); + table_[i] = nullptr; + } + } + + T *GetAddressAndRelease(uint32_t index) { + std::lock_guard lk(mu_); + T *ptr = table_[index]; + CHECK(ptr); + indices_.push(index); + table_[index] = nullptr; + return ptr; + } + + uint32_t StoreAddress(T *ptr) { + std::lock_guard lk(mu_); + CHECK(ptr); + uint32_t idx = indices_.front(); + indices_.pop(); + CHECK_EQ(table_[idx], nullptr); + table_[idx] = ptr; + return idx; + } + +private: + static const int kMaxEntries = 512; + + std::mutex mu_; + std::queue indices_; + T *table_[kMaxEntries]; +}; + +struct Endpoint { + enum ConnectionStatus { IDLE, CONNECTING, CONNECTED, REJECTED }; + + ConnectionStatus status; + int node_id; + std::condition_variable cv; + std::mutex connect_mu; + struct rdma_cm_id *cm_id; + + WRContext rx_ctx[kRxDepth]; + + WRContext start_ctx[kStartDepth]; + WRContext reply_ctx[kReplyDepth]; + WRContext write_ctx[kWriteDepth]; + + ThreadsafeQueue free_start_ctx; + ThreadsafeQueue free_reply_ctx; + ThreadsafeQueue free_write_ctx; + + Endpoint() : status(IDLE), node_id(Node::kEmpty), cm_id(nullptr), rx_ctx() { + } + + ~Endpoint() { + for (int i = 0; i < kRxDepth; ++i) { + if (!(rx_ctx[i].buffer)) { + continue; + } + free(rx_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(rx_ctx[i].buffer), 0); + } + + for (int i = 0; i < kStartDepth; ++i) { + if (start_ctx[i].buffer) { + free(start_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(start_ctx[i].buffer), 0); + } + } + + for (int i = 0; i < kReplyDepth; ++i) { + if (reply_ctx[i].buffer) { + free(reply_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(reply_ctx[i].buffer), 0); + } + } + + for (int i = 0; i < kWriteDepth; ++i) { + if (write_ctx[i].buffer) { + free(write_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(write_ctx[i].buffer), 0); + } + } + + rdma_destroy_qp(cm_id); + CHECK_EQ(rdma_destroy_id(cm_id), 0) << strerror(errno); + } + + void Disconnect() { + std::unique_lock lk(connect_mu); + CHECK_EQ(rdma_disconnect(cm_id), 0) << strerror(errno); + cv.wait(lk, [this] { return status == IDLE; }); + } + + void SetNodeID(int id) { + node_id = id; + } + + void InitSendContextHelper(struct ibv_pd *pd, WRContext *ctx, + ThreadsafeQueue *queue, size_t num, + WRContextType type) { + for (size_t i = 0; i < num; ++i) { + void *buf = aligned_alloc(kAlignment, kMempoolChunkSize); + CHECK(buf); + struct ibv_mr *mr = ibv_reg_mr(pd, buf, kMempoolChunkSize, 0); + CHECK(mr); + + ctx[i].type = type; + ctx[i].buffer = mr; + ctx[i].private_data = this; + queue->Push(&ctx[i]); + } + } + + void Init(struct ibv_cq *cq, struct ibv_pd *pd) { + struct 
ibv_qp_init_attr attr; + memset(&attr, 0, sizeof(ibv_qp_init_attr)); + attr.send_cq = cq; + attr.recv_cq = cq; + attr.cap.max_send_wr = kStartDepth + kReplyDepth + kWriteDepth; + attr.cap.max_recv_wr = kRxDepth; + attr.cap.max_send_sge = kSGEntry; + attr.cap.max_recv_sge = kSGEntry; + attr.qp_type = IBV_QPT_RC; + attr.sq_sig_all = 0; + + CHECK_EQ(rdma_create_qp(cm_id, pd, &attr), 0) + << "Create RDMA queue pair failed"; + + InitSendContextHelper(pd, start_ctx, &free_start_ctx, kStartDepth, + kRendezvousStartContext); + InitSendContextHelper(pd, reply_ctx, &free_reply_ctx, kReplyDepth, + kRendezvousReplyContext); + InitSendContextHelper(pd, write_ctx, &free_write_ctx, kWriteDepth, + kWriteContext); + + for (size_t i = 0; i < kRxDepth; ++i) { + void *buf = aligned_alloc(kAlignment, kMempoolChunkSize); + CHECK(buf); + struct ibv_mr *mr = + ibv_reg_mr(pd, buf, kMempoolChunkSize, IBV_ACCESS_LOCAL_WRITE); + CHECK(mr); + + rx_ctx[i].type = kReceiveContext; + rx_ctx[i].buffer = mr; + rx_ctx[i].private_data = this; + + PostRecv(&rx_ctx[i]); + } + } + + void PostRecv(WRContext *ctx) { + struct ibv_recv_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(ctx->buffer->addr); + sge.length = kMempoolChunkSize; + sge.lkey = ctx->buffer->lkey; + + wr.wr_id = reinterpret_cast(ctx); + wr.next = nullptr; + wr.sg_list = &sge; + wr.num_sge = 1; + + CHECK_EQ(ibv_post_recv(cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_recv failed."; + } +}; + +class IBVerbsVan : public Van { +public: + IBVerbsVan() { + } + ~IBVerbsVan() { + } + +protected: + void Start(int customer_id) override { + start_mu_.lock(); + should_stop_ = false; + + if (event_channel_ == nullptr) { + event_channel_ = rdma_create_event_channel(); + CHECK(event_channel_) << "Create RDMA event channel failed"; + + cm_event_polling_thread_.reset( + new std::thread(&IBVerbsVan::PollEvents, this)); + } + + start_mu_.unlock(); + Van::Start(customer_id); + } + + void Stop() override { + PS_VLOG(1) << my_node_.ShortDebugString() << " is stopping"; + Van::Stop(); + + should_stop_ = true; + CHECK(should_stop_); + + PS_VLOG(1) << "Stopping cq_polling_thread_."; + cq_polling_thread_->join(); + cq_polling_thread_.reset(); + + PS_VLOG(1) << "Stopping cm_event_polling_thread_."; + cm_event_polling_thread_->join(); + cm_event_polling_thread_.reset(); + + PS_VLOG(1) << "Clearing mempool."; + mempool_.reset(); + + for (auto &it : allocated_mr_) { + ibv_dereg_mr(it.second); + } + + PS_VLOG(1) << "Clearing endpoints."; + incoming_.clear(); + endpoints_.clear(); + + PS_VLOG(1) << "Destroying cq and pd."; + CHECK(!ibv_destroy_cq(cq_)) << "Failed to destroy CQ"; + CHECK(!ibv_destroy_comp_channel(comp_event_channel_)) + << "Failed to destroy channel"; + + // TODO(changlan): ibv_dealloc_pd sometimes complains about busy + // resources + + PS_VLOG(1) << "Destroying listener."; + rdma_destroy_id(listener_); + rdma_destroy_event_channel(event_channel_); + } + + int Bind(const Node &node, int max_retry) override { + CHECK(rdma_create_id(event_channel_, &listener_, nullptr, RDMA_PS_TCP) + == 0) + << "Create RDMA connection identifier failed"; + + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + int port = node.port; + unsigned seed = static_cast(time(NULL) + port); + for (int i = 0; i < max_retry + 1; ++i) { + addr.sin_port = htons(port); + if (rdma_bind_addr(listener_, + reinterpret_cast(&addr)) + == 0) { + break; + } + if (i == max_retry) { + port = -1; + } else { + port = 10000 + 
rand_r(&seed) % 40000; + } + } + CHECK(rdma_listen(listener_, kRdmaListenBacklog) == 0) + << "Listen RDMA connection failed: " << strerror(errno); + return port; + } + + void Connect(const Node &node) override { + PS_VLOG(1) << "Connecting to " << my_node_.ShortDebugString(); + CHECK_NE(node.id, node.kEmpty); + CHECK_NE(node.port, node.kEmpty); + CHECK(node.hostname.size()); + + // worker doesn't need to connect to the other workers. same for server + if ((node.role == my_node_.role) && (node.id != my_node_.id)) { + return; + } + + std::string node_host_ip = + node.hostname + ":" + std::to_string(node.port); + if (node.id != Node::kEmpty) { + auto it = endpoints_.find(node.id); + + // if there is an endpoint with pending connection + if (it != endpoints_.end()) { + endpoints_.erase(it); + } + + Endpoint *endpoint; + endpoints_[node.id] = std::unique_ptr(new Endpoint()); + endpoint = endpoints_[node.id].get(); + + endpoint->SetNodeID(node.id); + + struct addrinfo *remote_addr; + CHECK_EQ(getaddrinfo(node.hostname.c_str(), + std::to_string(node.port).c_str(), nullptr, + &remote_addr), + 0); + + while (endpoint->status != Endpoint::CONNECTED) { + std::unique_lock lk(endpoint->connect_mu); + endpoint->status = Endpoint::CONNECTING; + + if (endpoint->cm_id != nullptr) { + rdma_destroy_qp(endpoint->cm_id); + CHECK_EQ(rdma_destroy_id(endpoint->cm_id), 0) + << strerror(errno); + endpoint->cm_id = nullptr; + } + + CHECK_EQ(rdma_create_id(event_channel_, &endpoint->cm_id, + nullptr, RDMA_PS_TCP), + 0) + << "Create RDMA connection identifier failed"; + endpoint->cm_id->context = endpoint; + + CHECK_EQ(rdma_resolve_addr(endpoint->cm_id, nullptr, + remote_addr->ai_addr, kTimeoutms), + 0) + << "Resolve RDMA address failed with errno: " << errno; + + endpoint->cv.wait(lk, [endpoint] { + return endpoint->status != Endpoint::CONNECTING; + }); + + if (endpoint->status == Endpoint::CONNECTED) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + freeaddrinfo(remote_addr); + } + } + + int SendMsg(const Message &msg) override { + int remote_id = msg.meta.recver; + CHECK_NE(remote_id, Meta::kEmpty); + + PBMeta meta; + PackMetaPB(msg.meta, &meta); + + CHECK_NE(endpoints_.find(remote_id), endpoints_.end()); + Endpoint *endpoint = endpoints_[remote_id].get(); + MessageBuffer *msg_buf = new MessageBuffer(); + + size_t meta_len = meta.ByteSize(); + size_t data_len = msg.meta.data_size; + size_t total_len = meta_len + data_len; + + CHECK(meta_len); + + // For control messages, inline the message content + // into the START message. + // Otherwise, register the data buffer as RDMA memory + // region. + if (!msg.meta.control.empty()) { // control message + msg_buf->inline_len = total_len; + msg_buf->inline_buf = mempool_->Alloc(total_len); + meta.SerializeToArray(msg_buf->inline_buf, meta_len); + char *cur = msg_buf->inline_buf + meta_len; + for (auto &sa : msg.data) { + size_t seg_len = sa.size(); + memcpy(cur, sa.data(), seg_len); + cur += seg_len; + } + } else { // data message + msg_buf->inline_len = meta_len; + msg_buf->inline_buf = mempool_->Alloc(meta_len); + msg_buf->data = msg.data; + meta.SerializeToArray(msg_buf->inline_buf, meta_len); + + for (auto &sa : msg_buf->data) { + if (sa.size() == 0) { + continue; + } + // Optimization: If the memory region has been registered, + // (assuming the previously registered address is not freed) + // re-use the same memory region. 
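+                // Registration via ibv_reg_mr pins the pages and is relatively
+                // expensive, so the lookup below caches one memory region per
+                // buffer address and reuses it across messages. Note that the
+                // cached region keeps the size it was first registered with,
+                // so this relies on the buffer neither being freed nor growing
+                // at the same address afterwards.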
+ char *p = sa.data(); + auto it = allocated_mr_.find(p); + if (it == allocated_mr_.end()) { + allocated_mr_[p] = ibv_reg_mr(pd_, p, sa.size(), 0); + } + CHECK(allocated_mr_[p]) << "Invalid memory region"; + msg_buf->mrs.push_back({allocated_mr_[p], sa.size()}); + } + } + + // Take the second context buffer first to avoid deadlock + WRContext *context = nullptr, *reserved = nullptr; + endpoint->free_write_ctx.WaitAndPop(&reserved); + endpoint->free_start_ctx.WaitAndPop(&context); + + msg_buf->reserved_context = reserved; + + RendezvousStart *req = + reinterpret_cast(context->buffer->addr); + req->meta_len = meta_len; + + for (size_t i = 0; i < msg.data.size(); ++i) { + req->data_len[i] = msg.data[i].size(); + } + req->data_num = msg.data.size(); + req->origin_addr = reinterpret_cast(msg_buf); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(req); + sge.length = sizeof(RendezvousStart); + sge.lkey = context->buffer->lkey; + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(context); + wr.opcode = IBV_WR_SEND_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = kRendezvousStart; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + CHECK_EQ(ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << strerror(errno); + + return total_len; + } + + int RecvMsg(Message *msg) override { + msg->data.clear(); + std::tuple notification; + recv_buffers_.WaitAndPop(¬ification); + + Endpoint *endpoint = std::get<0>(notification); + BufferContext *buffer_ctx = std::get<1>(notification); + + int total_len = 0; + + msg->meta.recver = my_node_.id; + msg->meta.sender = endpoint->node_id; + + char *cur = buffer_ctx->buffer; + + UnpackMeta(cur, buffer_ctx->meta_len, &msg->meta); + total_len += buffer_ctx->meta_len; + uint64_t data_num = buffer_ctx->data_num; + cur += buffer_ctx->meta_len; + + if (data_num > 0) { + Block *mem_block = + new Block(mempool_.get(), buffer_ctx->buffer, data_num); + + for (size_t i = 0; i < data_num; i++) { + uint32_t len = buffer_ctx->data_len[i]; + SArray data; + data.reset(cur, len, [mem_block](void *) { + mem_block->Release(); + }); // Defer the deletion of block_ref + msg->data.push_back(data); + cur += len; + total_len += len; + } + } else { + mempool_->Free(buffer_ctx->buffer); + } + + delete buffer_ctx; + return total_len; + } + +private: + void InitContext(struct ibv_context *context) { + context_ = context; + CHECK(context_) << "ibv_context* empty"; + + pd_ = ibv_alloc_pd(context_); + CHECK(pd_) << "Failed to allocate protection domain"; + + mempool_.reset(new SimpleMempool(pd_)); + + comp_event_channel_ = ibv_create_comp_channel(context_); + + // TODO(clan): Replace the rough estimate here + cq_ = ibv_create_cq(context_, kMaxConcurrentWorkRequest * 2, NULL, + comp_event_channel_, 0); + + CHECK(cq_) << "Failed to create completion queue"; + CHECK(!ibv_req_notify_cq(cq_, 0)) + << "Failed to request CQ notification"; + } + + void ReleaseWorkRequestContext(WRContext *context, Endpoint *endpoint) { + switch (context->type) { + case kRendezvousStartContext: + endpoint->free_start_ctx.Push(context); + break; + case kRendezvousReplyContext: + endpoint->free_reply_ctx.Push(context); + break; + case kWriteContext: + endpoint->free_write_ctx.Push(context); + break; + case kReceiveContext: + endpoint->PostRecv(context); + break; + default: + CHECK(0); + } + } + + void PollCQ() { + // Pre-allocated work completions array used for polling + struct ibv_wc wc[kMaxConcurrentWorkRequest]; + while 
(!should_stop_.load()) { + int ne = ibv_poll_cq(cq_, kMaxConcurrentWorkRequest, wc); + CHECK_GE(ne, 0); + for (int i = 0; i < ne; ++i) { + CHECK(wc[i].status == IBV_WC_SUCCESS) + << "Failed status \n" + << ibv_wc_status_str(wc[i].status) << " " << wc[i].status + << " " << static_cast(wc[i].wr_id) << " " + << wc[i].vendor_err; + + WRContext *context = reinterpret_cast(wc[i].wr_id); + Endpoint *endpoint = + reinterpret_cast(context->private_data); + + CHECK(endpoint); + + switch (wc[i].opcode) { + case IBV_WC_SEND: + // LOG(INFO) << "opcode: IBV_WC_SEND"; + ReleaseWorkRequestContext(context, endpoint); + break; + case IBV_WC_RDMA_WRITE: { + // LOG(INFO) << "opcode: IBV_WC_RDMA_WRITE"; + // Note: This is not a struct ibv_mr* + MessageBuffer *msg_buf = + *reinterpret_cast( + context->buffer->addr); + mempool_->Free(msg_buf->inline_buf); + delete msg_buf; + ReleaseWorkRequestContext(context, endpoint); + } break; + case IBV_WC_RECV_RDMA_WITH_IMM: { + // LOG(INFO) << "opcode: IBV_WC_RECV_RDMA_WITH_IMM"; + uint32_t addr_idx = wc[i].imm_data; + BufferContext *buf_ctx = + addr_pool_.GetAddressAndRelease(addr_idx); + recv_buffers_.Push(std::make_tuple(endpoint, buf_ctx)); + ReleaseWorkRequestContext(context, endpoint); + } break; + case IBV_WC_RECV: { + CHECK(wc[i].wc_flags & IBV_WC_WITH_IMM); + uint32_t imm = wc[i].imm_data; + struct ibv_mr *mr = context->buffer; + + if (imm == kRendezvousStart) { + // LOG(INFO) << "opcode: IBV_WC_RECV kRendezvousStart"; + RendezvousStart *req = + reinterpret_cast(mr->addr); + BufferContext *buf_ctx = new BufferContext(); + + uint64_t len = req->meta_len; + buf_ctx->meta_len = len; + buf_ctx->data_num = req->data_num; + for (size_t i = 0; i < req->data_num; ++i) { + buf_ctx->data_len[i] = req->data_len[i]; + len += req->data_len[i]; + } + + char *buffer = mempool_->Alloc(len); + CHECK(buffer) << "Alloc for " << len + << " bytes, data_num: " << req->data_num; + + buf_ctx->buffer = buffer; + + uint64_t origin_addr = req->origin_addr; + + WRContext *reply_ctx = nullptr; + endpoint->free_reply_ctx.WaitAndPop(&reply_ctx); + RendezvousReply *resp = + reinterpret_cast( + reply_ctx->buffer->addr); + + resp->addr = reinterpret_cast(buffer); + resp->rkey = mempool_->RemoteKey(buffer); + resp->origin_addr = origin_addr; + resp->idx = addr_pool_.StoreAddress(buf_ctx); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(resp); + sge.length = sizeof(RendezvousReply); + sge.lkey = reply_ctx->buffer->lkey; + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(reply_ctx); + wr.opcode = IBV_WR_SEND_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = kRendezvousReply; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + + CHECK_EQ( + ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_send failed."; + + } else if (imm == kRendezvousReply) { + // LOG(INFO) << "opcode: IBV_WC_RECV kRendezvousReply"; + RendezvousReply *resp = + reinterpret_cast(mr->addr); + uint64_t remote_addr = resp->addr; + uint64_t origin_addr = resp->origin_addr; + uint32_t rkey = resp->rkey; + uint32_t idx = resp->idx; + + MessageBuffer *msg_buf = + reinterpret_cast(origin_addr); + + struct ibv_sge sge[1 + msg_buf->mrs.size()]; + + sge[0].addr = + reinterpret_cast(msg_buf->inline_buf); + sge[0].length = msg_buf->inline_len; + sge[0].lkey = mempool_->LocalKey(msg_buf->inline_buf); + + size_t num_sge = 1; + for (auto &pair : msg_buf->mrs) { + size_t length = pair.second; + CHECK(length); + sge[num_sge].addr = + 
reinterpret_cast(pair.first->addr); + sge[num_sge].length = length; + sge[num_sge].lkey = pair.first->lkey; + ++num_sge; + } + + WRContext *write_ctx = msg_buf->reserved_context; + + MessageBuffer **tmp = + reinterpret_cast( + write_ctx->buffer->addr); + *tmp = msg_buf; // write the addr of msg_buf into the mr + // buffer + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(write_ctx); + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = idx; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = sge; + wr.num_sge = num_sge; + + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + CHECK_EQ( + ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_send failed."; + + } else { + CHECK(0); + } + ReleaseWorkRequestContext(context, endpoint); + } break; + default: + CHECK(0) << "Unexpected opcode: " << wc[i].opcode; + } + } + } + } + + void PollEvents() { + int flags = fcntl(event_channel_->fd, F_GETFL); + int rc = fcntl(event_channel_->fd, F_SETFL, flags | O_NONBLOCK); + CHECK_GE(rc, 0); + int error_flags = POLLERR | POLLHUP | POLLNVAL; + + while (!should_stop_.load()) { + struct pollfd pfd = { + .fd = event_channel_->fd, .events = POLLIN, .revents = 0}; + int ret = poll(&pfd, 1, 10); + + CHECK_GE(ret, 0) << strerror(errno); + CHECK_EQ(pfd.revents & error_flags, 0); + + if (!(pfd.revents & POLLIN)) { + continue; + } + + struct rdma_cm_event *event; + CHECK_EQ(rdma_get_cm_event(event_channel_, &event), 0); + // TODO(clan): Reorder the list according to the event frequency + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + OnConnectRequest(event); + break; + case RDMA_CM_EVENT_ADDR_RESOLVED: + OnAddrResolved(event); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + OnRouteResolved(event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + OnConnected(event); + break; + case RDMA_CM_EVENT_DISCONNECTED: + OnDisconnected(event); + break; + case RDMA_CM_EVENT_REJECTED: + OnRejected(event); + break; + default: + CHECK(0) << "OnEvent: unknown event " << event->event << " (" + << rdma_event_str(event->event) << ")"; + } + rdma_ack_cm_event(event); + } + } + + void OnRejected(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + + auto it = endpoints_.find(endpoint->node_id); + CHECK(it != endpoints_.end()) << "Connection not ready."; + CHECK_EQ(endpoint->status, Endpoint::CONNECTING); + CHECK_EQ(endpoint->cm_id, id); + + PS_VLOG(1) << "Connection rejected, retrying..."; + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::REJECTED; + } + endpoint->cv.notify_all(); + } + + void OnConnectRequest(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK_NOTNULL(id); + + CHECK_LE(sizeof(RequestContext), event->param.conn.private_data_len) + << "RequestContext size mismatch. 
Actual: " + << (size_t)event->param.conn.private_data_len + << ", Expected: " << sizeof(RequestContext); + CHECK_NOTNULL(event->param.conn.private_data); + + const RequestContext *remote_ctx = + reinterpret_cast( + event->param.conn.private_data); + + const auto r = + incoming_.emplace(std::unique_ptr(new Endpoint())); + Endpoint *endpoint = r.first->get(); + endpoint->SetNodeID(remote_ctx->node); + endpoint->cm_id = id; + id->context = endpoint; + + if (context_ == nullptr) { + InitContext(id->verbs); + } + + endpoint->Init(cq_, pd_); + + RequestContext ctx; + ctx.node = static_cast(my_node_.id); + ctx.port = static_cast(my_node_.port); + snprintf(ctx.hostname, kMaxHostnameLength, "%s", + my_node_.hostname.c_str()); + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + cm_params.retry_count = 7; + cm_params.rnr_retry_count = 7; + cm_params.private_data = &ctx; + cm_params.private_data_len = sizeof(RequestContext); + + CHECK_EQ(rdma_accept(id, &cm_params), 0) + << "Accept RDMA connection failed: " << strerror(errno); + } + + // Resolve a route after address is resolved + void OnAddrResolved(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK_EQ(rdma_resolve_route(id, kTimeoutms), 0) + << "Resolve RDMA route failed"; + } + + // Make a connection after route is resolved + void OnRouteResolved(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + + if (context_ == nullptr) { + InitContext(id->verbs); + } + + endpoint->Init(cq_, pd_); + + RequestContext ctx; + ctx.node = static_cast(my_node_.id); + ctx.port = static_cast(my_node_.port); + snprintf(ctx.hostname, kMaxHostnameLength, "%s", + my_node_.hostname.c_str()); + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + cm_params.retry_count = 7; + cm_params.rnr_retry_count = 7; + cm_params.private_data = &ctx; + cm_params.private_data_len = sizeof(RequestContext); + + CHECK_EQ(rdma_connect(id, &cm_params), 0) + << "RDMA connect failed" << strerror(errno); + } + + void OnConnected(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK(id) << "rdma_cm_id not found."; + Endpoint *endpoint = reinterpret_cast(id->context); + CHECK(endpoint) << "Endpoint not found."; + + if (cq_polling_thread_ == nullptr) { + cq_polling_thread_.reset( + new std::thread(&IBVerbsVan::PollCQ, this)); + } + + CHECK_EQ(endpoint->cm_id, id); + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::CONNECTED; + } + endpoint->cv.notify_all(); + } + + void OnDisconnected(struct rdma_cm_event *event) { + LOG(INFO) << "OnDisconnected from Node " << my_node_.id; + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::IDLE; + } + endpoint->cv.notify_all(); + } + + AddressPool addr_pool_; + std::unique_ptr mempool_; + + struct rdma_cm_id *listener_ = nullptr; + std::atomic should_stop_; + + std::unordered_map> endpoints_; + std::unordered_set> incoming_; + + struct rdma_event_channel *event_channel_ = nullptr; + struct ibv_context *context_ = nullptr; + + std::unordered_map allocated_mr_; + + // ibverbs protection domain + struct ibv_pd *pd_ = nullptr; + // Completion event channel, to wait for work completions + struct ibv_comp_channel *comp_event_channel_ = nullptr; + // Completion queue, to poll on work completions + struct ibv_cq *cq_ = nullptr; + // cq thread + 
std::unique_ptr cq_polling_thread_; + // event thread + std::unique_ptr cm_event_polling_thread_; + // Recv buffer queue + ThreadsafeQueue> recv_buffers_; +}; // namespace ps +}; // namespace ps + +#endif // DMLC_USE_IBVERBS +#endif // PS_IBVERBS_VAN_H_ diff --git a/ps-lite/src/network_utils.h b/ps-lite/src/network_utils.h new file mode 100644 index 0000000..953380e --- /dev/null +++ b/ps-lite/src/network_utils.h @@ -0,0 +1,277 @@ +/** + * Copyright (c) 2015 by Contributors + * @file network_utils.h + * @brief network utilities + */ +#ifndef PS_NETWORK_UTILS_H_ +#define PS_NETWORK_UTILS_H_ +#include +#ifdef _MSC_VER +#include +#include +#include +#include +#undef interface +#else +#include +#include +#include +#include +#endif +#include + +namespace ps { + +/** + * \brief return the IP address for given interface eth0, eth1, ... + */ +void GetIP(const std::string &interface, std::string *ip) { +#ifdef _MSC_VER + typedef std::basic_string tstring; + // Try to get the Adapters-info table, so we can given useful names to the + // IP addresses we are returning. Gotta call GetAdaptersInfo() up to 5 + // times to handle the potential race condition between the size-query call + // and the get-data call. I love a well-designed API :^P + IP_ADAPTER_INFO *pAdapterInfo = NULL; + { + ULONG bufLen = 0; + for (int i = 0; i < 5; i++) { + DWORD apRet = GetAdaptersInfo(pAdapterInfo, &bufLen); + if (apRet == ERROR_BUFFER_OVERFLOW) { + free(pAdapterInfo); // in case we had previously allocated it + pAdapterInfo = static_cast(malloc(bufLen)); + } else if (apRet == ERROR_SUCCESS) { + break; + } else { + free(pAdapterInfo); + pAdapterInfo = NULL; + break; + } + } + } + if (pAdapterInfo) { + tstring keybase = + _T( + "SYSTEM\\CurrentControlSet\\Control\\Network\\{4D36E972-E325-11CE-BFC1-08002BE10318}\\"); + tstring connection = _T("\\Connection"); + + IP_ADAPTER_INFO *curpAdapterInfo = pAdapterInfo; + while (curpAdapterInfo) { + HKEY hKEY; + std::string AdapterName = curpAdapterInfo->AdapterName; + // GUID only ascii + tstring key_set = keybase + + tstring(AdapterName.begin(), AdapterName.end()) + + connection; + LPCTSTR data_Set = key_set.c_str(); + LPCTSTR dwValue = NULL; + if (ERROR_SUCCESS + == ::RegOpenKeyEx(HKEY_LOCAL_MACHINE, data_Set, 0, KEY_READ, + &hKEY)) { + DWORD dwSize = 0; + DWORD dwType = REG_SZ; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + dwValue = new TCHAR[dwSize]; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + // interface name must only ascii + tstring tstr = dwValue; + std::string s(tstr.begin(), tstr.end()); + if (s == interface) { + *ip = + curpAdapterInfo->IpAddressList.IpAddress.String; + break; + } + } + } + ::RegCloseKey(hKEY); + } + curpAdapterInfo = curpAdapterInfo->Next; + } + free(pAdapterInfo); + } +#else + struct ifaddrs *ifAddrStruct = NULL; + struct ifaddrs *ifa = NULL; + void *tmpAddrPtr = NULL; + + getifaddrs(&ifAddrStruct); + for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family == AF_INET) { + // is a valid IP4 Address + tmpAddrPtr = + &(reinterpret_cast(ifa->ifa_addr)) + ->sin_addr; + char addressBuffer[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, tmpAddrPtr, addressBuffer, INET_ADDRSTRLEN); + if (strncmp(ifa->ifa_name, interface.c_str(), interface.size()) + == 0) { + *ip = addressBuffer; + break; + } + } + } + if (ifAddrStruct != NULL) + freeifaddrs(ifAddrStruct); +#endif +} + 
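A minimal caller of `GetIP` might look like the sketch below (an illustrative standalone example, not part of this patch; the interface name `"eth0"` and the include path are assumptions):

```cpp
// Illustrative usage of ps::GetIP (assumed include path; "eth0" is only an
// example interface name). GetIP leaves *ip unchanged if no IPv4 address is
// bound to the requested interface.
#include <iostream>
#include <string>
#include "network_utils.h"

int main() {
    std::string ip;
    ps::GetIP("eth0", &ip);
    if (ip.empty()) {
        std::cerr << "no IPv4 address bound to eth0" << std::endl;
        return 1;
    }
    std::cout << "eth0 -> " << ip << std::endl;
    return 0;
}
```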
+/** + * \brief return the IP address and Interface the first interface which is not + * loopback + * + * only support IPv4 + */ +void GetAvailableInterfaceAndIP(std::string *interface, std::string *ip) { +#ifdef _MSC_VER + typedef std::basic_string tstring; + IP_ADAPTER_INFO *pAdapterInfo = NULL; + { + ULONG bufLen = 0; + for (int i = 0; i < 5; i++) { + DWORD apRet = GetAdaptersInfo(pAdapterInfo, &bufLen); + if (apRet == ERROR_BUFFER_OVERFLOW) { + free(pAdapterInfo); // in case we had previously allocated it + pAdapterInfo = static_cast(malloc(bufLen)); + } else if (apRet == ERROR_SUCCESS) { + break; + } else { + free(pAdapterInfo); + pAdapterInfo = NULL; + break; + } + } + } + if (pAdapterInfo) { + tstring keybase = + _T( + "SYSTEM\\CurrentControlSet\\Control\\Network\\{4D36E972-E325-11CE-BFC1-08002BE10318}\\"); + tstring connection = _T("\\Connection"); + + IP_ADAPTER_INFO *curpAdapterInfo = pAdapterInfo; + HKEY hKEY = NULL; + while (curpAdapterInfo) { + std::string curip(curpAdapterInfo->IpAddressList.IpAddress.String); + if (curip == "127.0.0.1") { + curpAdapterInfo = curpAdapterInfo->Next; + continue; + } + if (curip == "0.0.0.0") { + curpAdapterInfo = curpAdapterInfo->Next; + continue; + } + + std::string AdapterName = curpAdapterInfo->AdapterName; + // GUID only ascii + tstring key_set = keybase + + tstring(AdapterName.begin(), AdapterName.end()) + + connection; + LPCTSTR data_Set = key_set.c_str(); + LPCTSTR dwValue = NULL; + if (ERROR_SUCCESS + == ::RegOpenKeyEx(HKEY_LOCAL_MACHINE, data_Set, 0, KEY_READ, + &hKEY)) { + DWORD dwSize = 0; + DWORD dwType = REG_SZ; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + dwValue = new TCHAR[dwSize]; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + // interface name must only ascii + tstring tstr = dwValue; + std::string s(tstr.begin(), tstr.end()); + + *interface = s; + *ip = curip; + break; + } + } + ::RegCloseKey(hKEY); + hKEY = NULL; + } + curpAdapterInfo = curpAdapterInfo->Next; + } + if (hKEY != NULL) { + ::RegCloseKey(hKEY); + } + free(pAdapterInfo); + } +#else + struct ifaddrs *ifAddrStruct = nullptr; + struct ifaddrs *ifa = nullptr; + + interface->clear(); + ip->clear(); + getifaddrs(&ifAddrStruct); + for (ifa = ifAddrStruct; ifa != nullptr; ifa = ifa->ifa_next) { + if (nullptr == ifa->ifa_addr) + continue; + + if (AF_INET == ifa->ifa_addr->sa_family + && 0 == (ifa->ifa_flags & IFF_LOOPBACK)) { + char address_buffer[INET_ADDRSTRLEN]; + void *sin_addr_ptr = + &(reinterpret_cast(ifa->ifa_addr)) + ->sin_addr; + inet_ntop(AF_INET, sin_addr_ptr, address_buffer, INET_ADDRSTRLEN); + + *ip = address_buffer; + *interface = ifa->ifa_name; + + break; + } + } + if (nullptr != ifAddrStruct) + freeifaddrs(ifAddrStruct); + return; +#endif +} + +/** + * \brief return an available port on local machine + * + * only support IPv4 + * \return 0 on failure + */ +int GetAvailablePort() { + struct sockaddr_in addr; + addr.sin_port = + htons(0); // have system pick up a random port available for me + addr.sin_family = AF_INET; // IPV4 + addr.sin_addr.s_addr = htonl(INADDR_ANY); // set our addr to any interface + + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (0 != bind(sock, (struct sockaddr *)&addr, sizeof(struct sockaddr_in))) { + perror("bind():"); + return 0; + } +#ifdef _MSC_VER + int addr_len = sizeof(struct sockaddr_in); +#else + socklen_t addr_len = sizeof(struct sockaddr_in); +#endif + + if (0 != getsockname(sock, (struct sockaddr 
*)&addr, &addr_len)) { + perror("getsockname():"); + return 0; + } + + int ret_port = ntohs(addr.sin_port); +#ifdef _MSC_VER + closesocket(sock); +#else + close(sock); +#endif + return ret_port; +} + +} // namespace ps +#endif // PS_NETWORK_UTILS_H_ diff --git a/ps-lite/src/p3_van.h b/ps-lite/src/p3_van.h new file mode 100644 index 0000000..fb2e880 --- /dev/null +++ b/ps-lite/src/p3_van.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_P3_VAN_H_ +#define PS_P3_VAN_H_ +#include +namespace ps { + +/** + * \brief P3 based Van implementation + */ +class P3Van : public ZMQVan { +public: + P3Van() { + } + virtual ~P3Van() { + } + +protected: + void Start(int customer_id) override { + start_mu_.lock(); + if (init_stage == 0) { + // start sender + // set num_threads= 5 for debug + int num_threads = 5; + for (int i = 0; i < num_threads; i++) + sender_threads_.emplace_back( + new std::thread(&P3Van::Sending, this)); + // sender_thread_ = std::unique_ptr( + // new std::thread(&P3Van::Sending, this)); + init_stage++; + } + start_mu_.unlock(); + ZMQVan::Start(customer_id); + } + + void Stop() override { + ZMQVan::Stop(); + for (auto &thread : sender_threads_) + thread->join(); + } + + int SendMsg(const Message &msg) override { + send_queue_.Push(msg); + return 0; + } + + void Sending() { + while (true) { + Message msg; + send_queue_.WaitAndPop(&msg); + ZMQVan::SendMsg(msg); + if (!msg.meta.control.empty() + && msg.meta.control.cmd == Control::TERMINATE) { + // debug for stop + send_queue_.Push(msg); + break; + } + } + } + +private: + /** the thread for sending messages */ + // using multithread + std::vector> sender_threads_; + ThreadsafePQueue send_queue_; + int init_stage = 0; +}; +} // namespace ps + +#endif // PS_P3_VAN_H_ diff --git a/ps-lite/src/postoffice.cc b/ps-lite/src/postoffice.cc new file mode 100644 index 0000000..0d91ab7 --- /dev/null +++ b/ps-lite/src/postoffice.cc @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#include +#include +#include +#include "ps/internal/postoffice.h" +#include "ps/internal/message.h" +#include "ps/base.h" +#include "common/logging.h" + +namespace ps { +Postoffice::Postoffice() { + env_ref_ = Environment::_GetSharedRef(); +} + +void Postoffice::InitEnvironment() { + const char *val = NULL; + std::string van_type = GetEnv("DMLC_PS_VAN_TYPE", "zmq"); + van_ = Van::Create(van_type); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_NUM_WORKER")); + num_workers_ = atoi(val); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_NUM_SERVER")); + num_servers_ = atoi(val); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_ROLE")); + std::string role(val); + is_worker_ = role == "worker"; + is_server_ = role == "server"; + is_scheduler_ = role == "scheduler"; + verbose_ = GetEnv("PS_VERBOSE", 0); +} + +void Postoffice::Start(int customer_id, const char *argv0, + const bool do_barrier) { + start_mu_.lock(); + if (init_stage_ == 0) { + InitEnvironment(); + // init glog + if (argv0) { + dmlc::InitLogging(argv0); + } else { + dmlc::InitLogging("ps-lite\0"); + } + + // init node info. 
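+        // The loops below expand every single node id and every composite
+        // group id (sums of the scheduler / server / worker group constants)
+        // into the full list of member node ids, so that a message addressed
+        // to a group can later be fanned out to each member via node_ids_.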
+ for (int i = 0; i < num_workers_; ++i) { + int id = WorkerRankToID(i); + for (int g : {id, kWorkerGroup, kWorkerGroup + kServerGroup, + kWorkerGroup + kScheduler, + kWorkerGroup + kServerGroup + kScheduler}) { + node_ids_[g].push_back(id); + } + } + + for (int i = 0; i < num_servers_; ++i) { + int id = ServerRankToID(i); + for (int g : {id, kServerGroup, kWorkerGroup + kServerGroup, + kServerGroup + kScheduler, + kWorkerGroup + kServerGroup + kScheduler}) { + node_ids_[g].push_back(id); + } + } + + for (int g : {kScheduler, kScheduler + kServerGroup + kWorkerGroup, + kScheduler + kWorkerGroup, kScheduler + kServerGroup}) { + node_ids_[g].push_back(kScheduler); + } + init_stage_++; + } + start_mu_.unlock(); + + // start van + van_->Start(customer_id); + + start_mu_.lock(); + if (init_stage_ == 1) { + // record start time + start_time_ = time(NULL); + init_stage_++; + } + start_mu_.unlock(); + // do a barrier here + if (do_barrier) + Barrier(customer_id, kWorkerGroup + kServerGroup + kScheduler); +} + +void Postoffice::Finalize(const int customer_id, const bool do_barrier) { + if (init_stage_ == 0) + return; + if (do_barrier) + Barrier(customer_id, kWorkerGroup + kServerGroup + kScheduler); + if (customer_id == 0) { + num_workers_ = 0; + num_servers_ = 0; + van_->Stop(); + init_stage_ = 0; + customers_.clear(); + node_ids_.clear(); + barrier_done_.clear(); + server_key_ranges_.clear(); + heartbeats_.clear(); + if (exit_callback_) + exit_callback_(); + } +} + +void Postoffice::AddCustomer(Customer *customer) { + std::lock_guard lk(mu_); + int app_id = CHECK_NOTNULL(customer)->app_id(); + // check if the customer id has existed + int customer_id = CHECK_NOTNULL(customer)->customer_id(); + CHECK_EQ(customers_[app_id].count(customer_id), (size_t)0) + << "customer_id " << customer_id << " already exists\n"; + customers_[app_id].insert(std::make_pair(customer_id, customer)); + std::unique_lock ulk(barrier_mu_); + barrier_done_[app_id].insert(std::make_pair(customer_id, false)); +} + +void Postoffice::RemoveCustomer(Customer *customer) { + std::lock_guard lk(mu_); + int app_id = CHECK_NOTNULL(customer)->app_id(); + int customer_id = CHECK_NOTNULL(customer)->customer_id(); + customers_[app_id].erase(customer_id); + if (customers_[app_id].empty()) { + customers_.erase(app_id); + } +} + +Customer *Postoffice::GetCustomer(int app_id, int customer_id, + int timeout) const { + Customer *obj = nullptr; + for (int i = 0; i < timeout * 1000 + 1; ++i) { + { + std::lock_guard lk(mu_); + const auto it = customers_.find(app_id); + if (it != customers_.end()) { + std::unordered_map customers_in_app = + it->second; + obj = customers_in_app[customer_id]; + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + return obj; +} + +void Postoffice::Barrier(int customer_id, int node_group) { + if (GetNodeIDs(node_group).size() <= 1) + return; + auto role = van_->my_node().role; + if (role == Node::SCHEDULER) { + CHECK(node_group & kScheduler); + } else if (role == Node::WORKER) { + CHECK(node_group & kWorkerGroup); + } else if (role == Node::SERVER) { + CHECK(node_group & kServerGroup); + } + + std::unique_lock ulk(barrier_mu_); + barrier_done_[0][customer_id] = false; + Message req; + req.meta.recver = kScheduler; + req.meta.request = true; + req.meta.control.cmd = Control::BARRIER; + req.meta.app_id = 0; + req.meta.customer_id = customer_id; + req.meta.control.barrier_group = node_group; + req.meta.timestamp = van_->GetTimestamp(); + van_->Send(req); + barrier_cond_.wait( + ulk, [this, 
customer_id] { return barrier_done_[0][customer_id]; }); +} + +const std::vector &Postoffice::GetServerKeyRanges() { + server_key_ranges_mu_.lock(); + if (server_key_ranges_.empty()) { + for (int i = 0; i < num_servers_; ++i) { + server_key_ranges_.push_back(Range( + kMaxKey / num_servers_ * i, kMaxKey / num_servers_ * (i + 1))); + } + } + server_key_ranges_mu_.unlock(); + return server_key_ranges_; +} + +void Postoffice::Manage(const Message &recv) { + CHECK(!recv.meta.control.empty()); + const auto &ctrl = recv.meta.control; + if (ctrl.cmd == Control::BARRIER && !recv.meta.request) { + barrier_mu_.lock(); + auto size = barrier_done_[recv.meta.app_id].size(); + for (size_t customer_id = 0; customer_id < size; customer_id++) { + barrier_done_[recv.meta.app_id][customer_id] = true; + } + barrier_mu_.unlock(); + barrier_cond_.notify_all(); + } +} + +std::vector Postoffice::GetDeadNodes(int t) { + std::vector dead_nodes; + if (!van_->IsReady() || t == 0) + return dead_nodes; + + time_t curr_time = time(NULL); + const auto &nodes = is_scheduler_ ? + GetNodeIDs(kWorkerGroup + kServerGroup) : + GetNodeIDs(kScheduler); + { + std::lock_guard lk(heartbeat_mu_); + for (int r : nodes) { + auto it = heartbeats_.find(r); + if ((it == heartbeats_.end() || it->second + t < curr_time) + && start_time_ + t < curr_time) { + dead_nodes.push_back(r); + } + } + } + return dead_nodes; +} +} // namespace ps diff --git a/ps-lite/src/python_binding.cc b/ps-lite/src/python_binding.cc new file mode 100644 index 0000000..e4f5b6d --- /dev/null +++ b/ps-lite/src/python_binding.cc @@ -0,0 +1,140 @@ +#include "ps/worker/worker.h" + +#include "ps/ps.h" +#include "ps/server/kvserver.h" + +extern "C" { + +void Init() { + if (Postoffice::Get()->van()) + return; + Start(0); +} + +void Finalize() { + Finalize(0, true); +} + +void Pull(int node_name, DLArray *arr) { + worker.pull(node_name, arr); +} + +void Push(int node_name, const DLArray *arr, DLEvent *evt) { + worker.push(node_name, arr, evt); +} + +void DDPushPull(int node_name, const DLArray *in_arr, DLArray *out_arr, + DLEvent *evt) { + worker.dd_pushpull(node_name, in_arr, out_arr, evt); +} + +void SparsePush(int node_name, const DLArray *index, const DLArray *value, + DLEvent *evt) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sparse_push(node_name, index, value, index_size, evt); +} + +void SparsePull(int node_name, const DLArray *index, DLArray *value) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sparse_pull(node_name, index, value, index_size); +} + +void SDPushPull(int node_name, const DLArray *index, const DLArray *in_arr, + DLArray *out_arr, DLEvent *evt) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sd_pushpull(node_name, index, in_arr, index_size, out_arr, evt); +} + +void SSPushPull(int node_name, const DLArray *inindices, const DLArray *in_arr, + const DLArray *outindices, DLArray *out_arr, DLEvent *evt) { + size_t index_size = 1; + assert(inindices->ndim == outindices->ndim); + for (int i = 0; i < inindices->ndim; ++i) { + assert(inindices->shape[i] == outindices->shape[i]); + index_size *= inindices->shape[i]; + } + worker.ss_pushpull(node_name, inindices, in_arr, outindices, out_arr, + index_size, evt); +} + +/** + * args: + * index, example index + * value, the example value + * length, length of every example + */ +Worker::query_t PushData(const long *index, int 
index_size, + const DLArray *value, const long *length) { + return worker.push_data(index, index_size, value, length); +} + +Worker::query_t PullData(const long *index, int index_size, DLArray *value, + const long *length) { + return worker.pull_data(index, index_size, value, length); +} + +void Wait(int node_id) { + worker.wait(node_id); +} + +void WaitData(Worker::query_t query) { + worker.wait_data(query); +} + +void BarrierWorker() { + Postoffice::Get()->Barrier(0, kWorkerGroup); +} + +void InitTensor(int node_name, int ptype, int len, int width, int init_type, + double init_a, double init_b, unsigned long long seed, + int otype, float lrs[], int nlr) { + worker.parameter_init( + node_name, static_cast(ptype), static_cast(len), + static_cast(width), static_cast(init_type), init_a, + init_b, seed, static_cast(otype), SArray(lrs, nlr)); +} + +void Clear(int node_name) { + worker.clear(node_name); +} + +void ClearOnServer(int node_name) { + worker.clear_on_server(node_name); +} + +void SaveParam(int node_name, char *address) { + worker.parameter_save(node_name, address); +} + +void LoadParam(int node_name, char *address) { + worker.parameter_load(node_name, address); +} + +void startRecord(char *dirPath) { + PSAgent::Get()->startRecord(std::string(dirPath)); +} + +void getLoads() { + PSAgent::Get()->getLoads(); +} + +void StartServer() { + auto server = new KVServer(0); + RegisterExitCallback([server]() { delete server; }); +} + +int rank() { + return Postoffice::Get()->my_rank(); +} + +int nrank() { + return Postoffice::Get()->num_workers(); +} + +} // extern "C" diff --git a/ps-lite/src/resender.h b/ps-lite/src/resender.h new file mode 100644 index 0000000..e96bcbc --- /dev/null +++ b/ps-lite/src/resender.h @@ -0,0 +1,150 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_RESENDER_H_ +#define PS_RESENDER_H_ +#include +#include +#include +#include +namespace ps { + +/** + * \brief resend a messsage if no ack is received within a given time + */ +class Resender { +public: + /** + * \param timeout timeout in millisecond + */ + Resender(int timeout, int max_num_retry, Van *van) { + timeout_ = timeout; + max_num_retry_ = max_num_retry; + van_ = van; + monitor_ = new std::thread(&Resender::Monitoring, this); + } + ~Resender() { + exit_ = true; + monitor_->join(); + delete monitor_; + } + + /** + * \brief add an outgoining message + * + */ + void AddOutgoing(const Message &msg) { + if (msg.meta.control.cmd == Control::ACK) + return; + CHECK_NE(msg.meta.timestamp, Meta::kEmpty) << msg.DebugString(); + auto key = GetKey(msg); + std::lock_guard lk(mu_); + // already buffered, which often due to call Send by the monitor thread + if (send_buff_.find(key) != send_buff_.end()) + return; + + auto &ent = send_buff_[key]; + ent.msg = msg; + ent.send = Now(); + ent.num_retry = 0; + } + + /** + * \brief add an incomming message + * \brief return true if msg has been added before or a ACK message + */ + bool AddIncomming(const Message &msg) { + // a message can be received by multiple times + if (msg.meta.control.cmd == Control::TERMINATE) { + return false; + } else if (msg.meta.control.cmd == Control::ACK) { + mu_.lock(); + auto key = msg.meta.control.msg_sig; + auto it = send_buff_.find(key); + if (it != send_buff_.end()) + send_buff_.erase(it); + mu_.unlock(); + return true; + } else { + mu_.lock(); + auto key = GetKey(msg); + auto it = acked_.find(key); + bool duplicated = it != acked_.end(); + if (!duplicated) + acked_.insert(key); + mu_.unlock(); + // send back ack message (even if it is 
duplicated)
+            Message ack;
+            ack.meta.recver = msg.meta.sender;
+            ack.meta.sender = msg.meta.recver;
+            ack.meta.control.cmd = Control::ACK;
+            ack.meta.control.msg_sig = key;
+            van_->Send(ack);
+            // warning
+            if (duplicated)
+                LOG(WARNING) << "Duplicated message: " << msg.DebugString();
+            return duplicated;
+        }
+    }
+
+private:
+    using Time = std::chrono::milliseconds;
+    // the buffer entry
+    struct Entry {
+        Message msg;
+        Time send;
+        int num_retry = 0;
+    };
+    std::unordered_map<uint64_t, Entry> send_buff_;
+
+    uint64_t GetKey(const Message &msg) {
+        CHECK_NE(msg.meta.timestamp, Meta::kEmpty) << msg.DebugString();
+        uint16_t id = msg.meta.app_id;
+        uint8_t sender = msg.meta.sender == Node::kEmpty ? van_->my_node().id :
+                                                           msg.meta.sender;
+        uint8_t recver = msg.meta.recver;
+        return (static_cast<uint64_t>(id) << 48)
+               | (static_cast<uint64_t>(sender) << 40)
+               | (static_cast<uint64_t>(recver) << 32)
+               | (msg.meta.timestamp << 1) | msg.meta.request;
+    }
+    Time Now() {
+        return std::chrono::duration_cast