| @@ -0,0 +1,6 @@ | |||||
| [submodule "third_party/GraphMix"] | |||||
| path = third_party/GraphMix | |||||
| url = https://github.com/nox-410/GraphMix.git | |||||
| [submodule "third_party/HetuML"] | |||||
| path = third_party/HetuML | |||||
| url = https://github.com/ccchengff/HetuML.git | |||||
| @@ -0,0 +1,59 @@ | |||||
| cmake_minimum_required(VERSION 3.18) | |||||
| project(Hetu CXX) | |||||
| include(cmake/config.cmake) | |||||
| list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) | |||||
| set(CMAKE_CXX_STANDARD 11) | |||||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | |||||
| set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) | |||||
| set(CMAKE_CXX_FLAGS "-O3 -Wall") | |||||
| # openmp | |||||
| find_package(OpenMP REQUIRED) | |||||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | |||||
| # compile flag | |||||
| if(${HETU_VERSION} STREQUAL "all") | |||||
| set(HETU_COMPILE_GPU ON) | |||||
| set(HETU_COMPILE_MKL ON) | |||||
| elseif(${HETU_VERSION} STREQUAL "gpu") | |||||
| set(HETU_COMPILE_GPU ON) | |||||
| set(HETU_COMPILE_MKL OFF) | |||||
| elseif(${HETU_VERSION} STREQUAL "mkl") | |||||
| set(HETU_COMPILE_GPU OFF) | |||||
| set(HETU_COMPILE_MKL ON) | |||||
| else() | |||||
| message(FATAL_ERROR "unknown hetu version") | |||||
| endif() | |||||
| message(STATUS "HETU version: ${HETU_VERSION}") | |||||
| # cuda | |||||
| if(${HETU_COMPILE_GPU}) | |||||
| set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc) | |||||
| file(READ ${CUDAToolkit_ROOT}/version.txt RAW_CUDA_VERSION) | |||||
| string(REGEX MATCH "[0-9\.]+" CUDA_VERSION ${RAW_CUDA_VERSION}) | |||||
| if(${CUDA_VERSION} VERSION_LESS "10.1") | |||||
| message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDA_VERSION}") | |||||
| endif() | |||||
| find_package(CUDAToolkit REQUIRED) | |||||
| enable_language(CUDA) | |||||
| endif() | |||||
| include(FetchContent) # download third_party | |||||
| add_subdirectory(${CMAKE_SOURCE_DIR}/src) | |||||
| if(${HETU_PS}) | |||||
| add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite) | |||||
| endif() | |||||
| if(${HETU_GEOMETRIC}) | |||||
| add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix) | |||||
| endif() | |||||
| if (HETU_ML) | |||||
| add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML) | |||||
| endif() | |||||
| enable_testing() | |||||
| @@ -0,0 +1,33 @@ | |||||
| ## Committer | |||||
| Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer. | |||||
| One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers. | |||||
| Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md). | |||||
| ## Committer Members | |||||
### Current Committers
- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](mailto:swordonline@foxmail.com)>
- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](mailto:ccchengff@gmail.com)>
| - [codecaution](https://github.com/codecaution) - **Xiaonan Nie** | |||||
| - [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang** | |||||
| - [nox-410](https://github.com/nox-410) - **Yining Shi** | |||||
| - [initzhang](https://github.com/initzhang) - **Xin Zhang** | |||||
| - [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang** | |||||
| - [AFDWang](https://github.com/AFDWang) - **Yujie Wang** | |||||
| - [sj1104](https://github.com/sj1104) - **Jia Shen** | |||||
| - [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou** | |||||
| - [zmxdream](https://github.com/zmxdream) - **Minxu Zhang** | |||||
### Contributors
We would like to sincerely thank the following community members for their contributions to Hetu.
| - [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)** | |||||
- [bluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
| - [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)** | |||||
| - [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)** | |||||
| - [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)** | |||||
| - [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao** | |||||
| - [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue** | |||||
| - **Lizi Su** | |||||
| - **Dong Li** | |||||
| @@ -0,0 +1,53 @@ | |||||
| # Contributing to Hetu | |||||
Welcome to [report issues](https://github.com/PKU-DAIR/Hetu/issues) or open [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It's recommended to read the following contributing guide before contributing.
| ## Issues | |||||
We use GitHub Issues to track public bugs and feature requests.
| ### Search Known Issues First | |||||
Please search the existing issues first to see if a similar issue or feature request has already been filed, and make sure your issue isn't a duplicate.
| ### Reporting New Issues | |||||
If you open an issue, the more information the better: for example, a detailed description, screenshots or a video of your problem, and logs or code blocks for your crash.
| ## Pull Requests | |||||
| We strongly welcome your pull request to make Hetu better. | |||||
| ### Branch Management | |||||
There are two types of branches here:
| 1. `main` branch. | |||||
(1). It is the latest (pre-)release branch. We use `main` for tags, with version numbers `1.0.0`, `1.1.0`, `1.2.0`...
| (2). **Don't submit any PR on `main` branch.** | |||||
2. `specific version` branches.
(1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable development branch. After full testing, a `specific version` branch will be merged into the `main` branch for the next release.
(2). **You are recommended to submit bugfix or feature PRs on a `specific version` branch.**
Normal bugfixes or feature requests should be submitted to a `specific version` branch. After full testing, we will merge them into the `main` branch for the next release.
| ### Make Pull Requests | |||||
The code team will monitor all pull requests and run code checks and tests on them. After all tests pass, we will accept the PR, but it won't be merged into the `main` branch immediately; there may be some delay.
Before submitting a pull request, please make sure the following are done (a command-line sketch follows the list):
| 1. Fork the repo and create your branch from `main` or `specific version`. | |||||
| 2. Update code or documentation if you have changed APIs. | |||||
| 3. Add the copyright notice to the top of any new files you've added. | |||||
4. Make sure your code passes lint and style checks.
5. Test your code thoroughly.
6. Now you can submit your pull request against a `specific version` branch.
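For illustration, a typical command-line flow for these steps might look like the sketch below; the fork URL placeholder, branch name, and commit message are examples only:
```bash
# Example only: fork PKU-DAIR/Hetu on GitHub first, then clone your fork
git clone https://github.com/<your-username>/Hetu.git
cd Hetu
git remote add upstream https://github.com/PKU-DAIR/Hetu.git
git fetch upstream
# branch off the target `specific version` branch, e.g. branch-1.0.0
git checkout -b my-bugfix upstream/branch-1.0.0
# ...edit, run lint/style checks and tests...
git commit -am "fix: describe your change"
git push origin my-bugfix
# finally, open a pull request against the `specific version` branch on GitHub
```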
| ## Code Style Guide | |||||
| Use [Code Style](./.clang-format) for Python and C++. | |||||
| ## License | |||||
| By contributing to Hetu, you agree that your contributions will be licensed | |||||
under the [License](LICENSE).
| @@ -1,124 +1,201 @@ | |||||
| 木兰宽松许可证, 第2版 | |||||
| 2020年1月 http://license.coscl.org.cn/MulanPSL2 | |||||
| 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: | |||||
| 0. 定义 | |||||
| “软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 | |||||
| “贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 | |||||
| “贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 | |||||
| “法人实体” 是指提交贡献的机构及其“关联实体”。 | |||||
| “关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 | |||||
| 1. 授予版权许可 | |||||
| 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 | |||||
| 2. 授予专利许可 | |||||
| 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 | |||||
| 3. 无商标许可 | |||||
| “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 | |||||
| 4. 分发限制 | |||||
| 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 | |||||
| 5. 免责声明与责任限制 | |||||
| “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 | |||||
| 6. 语言 | |||||
| “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 | |||||
| 条款结束 | |||||
| 如何将木兰宽松许可证,第2版,应用到您的软件 | |||||
| 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: | |||||
| 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; | |||||
| 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; | |||||
| 3, 请将如下声明文本放入每个源文件的头部注释中。 | |||||
| Copyright (c) [Year] [name of copyright holder] | |||||
| [Software Name] is licensed under Mulan PSL v2. | |||||
| You can use this software according to the terms and conditions of the Mulan PSL v2. | |||||
| You may obtain a copy of Mulan PSL v2 at: | |||||
| http://license.coscl.org.cn/MulanPSL2 | |||||
| THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, | |||||
| EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, | |||||
| MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. | |||||
| See the Mulan PSL v2 for more details. | |||||
| Mulan Permissive Software License,Version 2 | |||||
| Mulan Permissive Software License,Version 2 (Mulan PSL v2) | |||||
| January 2020 http://license.coscl.org.cn/MulanPSL2 | |||||
| Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: | |||||
| 0. Definition | |||||
| Software means the program and related documents which are licensed under this License and comprise all Contribution(s). | |||||
| Contribution means the copyrightable work licensed by a particular Contributor under this License. | |||||
| Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. | |||||
| Legal Entity means the entity making a Contribution and all its Affiliates. | |||||
| Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. | |||||
| 1. Grant of Copyright License | |||||
| Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. | |||||
| 2. Grant of Patent License | |||||
| Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. | |||||
| 3. No Trademark License | |||||
| No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4. | |||||
| 4. Distribution Restriction | |||||
| You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. | |||||
| 5. Disclaimer of Warranty and Limitation of Liability | |||||
| THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. | |||||
| 6. Language | |||||
| THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. | |||||
| END OF THE TERMS AND CONDITIONS | |||||
| How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software | |||||
| To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: | |||||
| Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; | |||||
| Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package; | |||||
| Attach the statement to the appropriate annotated syntax at the beginning of each source file. | |||||
| Copyright (c) [Year] [name of copyright holder] | |||||
| [Software Name] is licensed under Mulan PSL v2. | |||||
| You can use this software according to the terms and conditions of the Mulan PSL v2. | |||||
| You may obtain a copy of Mulan PSL v2 at: | |||||
| http://license.coscl.org.cn/MulanPSL2 | |||||
| THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, | |||||
| EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, | |||||
| MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. | |||||
| See the Mulan PSL v2 for more details. | |||||
| Apache License | |||||
| Version 2.0, January 2004 | |||||
| http://www.apache.org/licenses/ | |||||
| TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | |||||
| 1. Definitions. | |||||
| "License" shall mean the terms and conditions for use, reproduction, | |||||
| and distribution as defined by Sections 1 through 9 of this document. | |||||
| "Licensor" shall mean the copyright owner or entity authorized by | |||||
| the copyright owner that is granting the License. | |||||
| "Legal Entity" shall mean the union of the acting entity and all | |||||
| other entities that control, are controlled by, or are under common | |||||
| control with that entity. For the purposes of this definition, | |||||
| "control" means (i) the power, direct or indirect, to cause the | |||||
| direction or management of such entity, whether by contract or | |||||
| otherwise, or (ii) ownership of fifty percent (50%) or more of the | |||||
| outstanding shares, or (iii) beneficial ownership of such entity. | |||||
| "You" (or "Your") shall mean an individual or Legal Entity | |||||
| exercising permissions granted by this License. | |||||
| "Source" form shall mean the preferred form for making modifications, | |||||
| including but not limited to software source code, documentation | |||||
| source, and configuration files. | |||||
| "Object" form shall mean any form resulting from mechanical | |||||
| transformation or translation of a Source form, including but | |||||
| not limited to compiled object code, generated documentation, | |||||
| and conversions to other media types. | |||||
| "Work" shall mean the work of authorship, whether in Source or | |||||
| Object form, made available under the License, as indicated by a | |||||
| copyright notice that is included in or attached to the work | |||||
| (an example is provided in the Appendix below). | |||||
| "Derivative Works" shall mean any work, whether in Source or Object | |||||
| form, that is based on (or derived from) the Work and for which the | |||||
| editorial revisions, annotations, elaborations, or other modifications | |||||
| represent, as a whole, an original work of authorship. For the purposes | |||||
| of this License, Derivative Works shall not include works that remain | |||||
| separable from, or merely link (or bind by name) to the interfaces of, | |||||
| the Work and Derivative Works thereof. | |||||
| "Contribution" shall mean any work of authorship, including | |||||
| the original version of the Work and any modifications or additions | |||||
| to that Work or Derivative Works thereof, that is intentionally | |||||
| submitted to Licensor for inclusion in the Work by the copyright owner | |||||
| or by an individual or Legal Entity authorized to submit on behalf of | |||||
| the copyright owner. For the purposes of this definition, "submitted" | |||||
| means any form of electronic, verbal, or written communication sent | |||||
| to the Licensor or its representatives, including but not limited to | |||||
| communication on electronic mailing lists, source code control systems, | |||||
| and issue tracking systems that are managed by, or on behalf of, the | |||||
| Licensor for the purpose of discussing and improving the Work, but | |||||
| excluding communication that is conspicuously marked or otherwise | |||||
| designated in writing by the copyright owner as "Not a Contribution." | |||||
| "Contributor" shall mean Licensor and any individual or Legal Entity | |||||
| on behalf of whom a Contribution has been received by Licensor and | |||||
| subsequently incorporated within the Work. | |||||
| 2. Grant of Copyright License. Subject to the terms and conditions of | |||||
| this License, each Contributor hereby grants to You a perpetual, | |||||
| worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||||
| copyright license to reproduce, prepare Derivative Works of, | |||||
| publicly display, publicly perform, sublicense, and distribute the | |||||
| Work and such Derivative Works in Source or Object form. | |||||
| 3. Grant of Patent License. Subject to the terms and conditions of | |||||
| this License, each Contributor hereby grants to You a perpetual, | |||||
| worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||||
| (except as stated in this section) patent license to make, have made, | |||||
| use, offer to sell, sell, import, and otherwise transfer the Work, | |||||
| where such license applies only to those patent claims licensable | |||||
| by such Contributor that are necessarily infringed by their | |||||
| Contribution(s) alone or by combination of their Contribution(s) | |||||
| with the Work to which such Contribution(s) was submitted. If You | |||||
| institute patent litigation against any entity (including a | |||||
| cross-claim or counterclaim in a lawsuit) alleging that the Work | |||||
| or a Contribution incorporated within the Work constitutes direct | |||||
| or contributory patent infringement, then any patent licenses | |||||
| granted to You under this License for that Work shall terminate | |||||
| as of the date such litigation is filed. | |||||
| 4. Redistribution. You may reproduce and distribute copies of the | |||||
| Work or Derivative Works thereof in any medium, with or without | |||||
| modifications, and in Source or Object form, provided that You | |||||
| meet the following conditions: | |||||
| (a) You must give any other recipients of the Work or | |||||
| Derivative Works a copy of this License; and | |||||
| (b) You must cause any modified files to carry prominent notices | |||||
| stating that You changed the files; and | |||||
| (c) You must retain, in the Source form of any Derivative Works | |||||
| that You distribute, all copyright, patent, trademark, and | |||||
| attribution notices from the Source form of the Work, | |||||
| excluding those notices that do not pertain to any part of | |||||
| the Derivative Works; and | |||||
| (d) If the Work includes a "NOTICE" text file as part of its | |||||
| distribution, then any Derivative Works that You distribute must | |||||
| include a readable copy of the attribution notices contained | |||||
| within such NOTICE file, excluding those notices that do not | |||||
| pertain to any part of the Derivative Works, in at least one | |||||
| of the following places: within a NOTICE text file distributed | |||||
| as part of the Derivative Works; within the Source form or | |||||
| documentation, if provided along with the Derivative Works; or, | |||||
| within a display generated by the Derivative Works, if and | |||||
| wherever such third-party notices normally appear. The contents | |||||
| of the NOTICE file are for informational purposes only and | |||||
| do not modify the License. You may add Your own attribution | |||||
| notices within Derivative Works that You distribute, alongside | |||||
| or as an addendum to the NOTICE text from the Work, provided | |||||
| that such additional attribution notices cannot be construed | |||||
| as modifying the License. | |||||
| You may add Your own copyright statement to Your modifications and | |||||
| may provide additional or different license terms and conditions | |||||
| for use, reproduction, or distribution of Your modifications, or | |||||
| for any such Derivative Works as a whole, provided Your use, | |||||
| reproduction, and distribution of the Work otherwise complies with | |||||
| the conditions stated in this License. | |||||
| 5. Submission of Contributions. Unless You explicitly state otherwise, | |||||
| any Contribution intentionally submitted for inclusion in the Work | |||||
| by You to the Licensor shall be under the terms and conditions of | |||||
| this License, without any additional terms or conditions. | |||||
| Notwithstanding the above, nothing herein shall supersede or modify | |||||
| the terms of any separate license agreement you may have executed | |||||
| with Licensor regarding such Contributions. | |||||
| 6. Trademarks. This License does not grant permission to use the trade | |||||
| names, trademarks, service marks, or product names of the Licensor, | |||||
| except as required for reasonable and customary use in describing the | |||||
| origin of the Work and reproducing the content of the NOTICE file. | |||||
| 7. Disclaimer of Warranty. Unless required by applicable law or | |||||
| agreed to in writing, Licensor provides the Work (and each | |||||
| Contributor provides its Contributions) on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| implied, including, without limitation, any warranties or conditions | |||||
| of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | |||||
| PARTICULAR PURPOSE. You are solely responsible for determining the | |||||
| appropriateness of using or redistributing the Work and assume any | |||||
| risks associated with Your exercise of permissions under this License. | |||||
| 8. Limitation of Liability. In no event and under no legal theory, | |||||
| whether in tort (including negligence), contract, or otherwise, | |||||
| unless required by applicable law (such as deliberate and grossly | |||||
| negligent acts) or agreed to in writing, shall any Contributor be | |||||
| liable to You for damages, including any direct, indirect, special, | |||||
| incidental, or consequential damages of any character arising as a | |||||
| result of this License or out of the use or inability to use the | |||||
| Work (including but not limited to damages for loss of goodwill, | |||||
| work stoppage, computer failure or malfunction, or any and all | |||||
| other commercial damages or losses), even if such Contributor | |||||
| has been advised of the possibility of such damages. | |||||
| 9. Accepting Warranty or Additional Liability. While redistributing | |||||
| the Work or Derivative Works thereof, You may choose to offer, | |||||
| and charge a fee for, acceptance of support, warranty, indemnity, | |||||
| or other liability obligations and/or rights consistent with this | |||||
| License. However, in accepting such obligations, You may act only | |||||
| on Your own behalf and on Your sole responsibility, not on behalf | |||||
| of any other Contributor, and only if You agree to indemnify, | |||||
| defend, and hold each Contributor harmless for any liability | |||||
| incurred by, or claims asserted against, such Contributor by reason | |||||
| of your accepting any such warranty or additional liability. | |||||
| END OF TERMS AND CONDITIONS | |||||
| APPENDIX: How to apply the Apache License to your work. | |||||
| To apply the Apache License to your work, attach the following | |||||
| boilerplate notice, with the fields enclosed by brackets "[]" | |||||
| replaced with your own identifying information. (Don't include | |||||
| the brackets!) The text should be enclosed in the appropriate | |||||
| comment syntax for the file format. We also recommend that a | |||||
| file or class name and description of purpose be included on the | |||||
| same "printed page" as the copyright notice for easier | |||||
| identification within third-party archives. | |||||
| Copyright [2021] [Peking University] | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| @@ -1,20 +1,177 @@ | |||||
| <div align=center> | |||||
| <img src="./img/hetu.png" width="300" /> | |||||
| </div> | |||||
| # HETU | |||||
| <!--- [](LICENSE) ---> | |||||
| [Documentation](https://hetu-doc.readthedocs.io) | | |||||
| [Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html) | |||||
Hetu is a high-performance distributed deep learning system targeting trillion-parameter DL model training, developed by <a href="http://net.pku.edu.cn/~cuibin/" target="_blank" rel="nofollow">DAIR Lab</a> at Peking University. It takes into account both high availability in industry and innovation in academia, and has a number of advanced characteristics:
- Applicability. DL model definition with standard dataflow graph; many basic CPU and GPU operators; efficient implementations of plenty of DL models and at least 10 popular ML algorithms.
- Efficiency. Achieves at least a 30% speedup compared to TensorFlow on DNN, CNN, and RNN benchmarks.
- Flexibility. Supports various parallel training protocols and distributed communication architectures, such as data/model/pipeline parallelism and Parameter Server & AllReduce.
- Scalability. Deploys on more than 100 computation nodes and trains giant models with trillions of parameters, e.g., on Criteo Kaggle and Open Graph Benchmark.
- Agility. Automatic ML pipeline: feature engineering, model selection, and hyperparameter search.
We welcome everyone interested in machine learning or graph computing to contribute code, create issues, or open pull requests. Please refer to the [Contribution Guide](CONTRIBUTING.md) for more details.
| ## Installation | |||||
| 1. Clone the repository. | |||||
2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used:
```conda env create -f environment.yml```
Please prepare the CUDA toolkit and cuDNN in advance.
3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
```bash
| # modify paths and configurations in cmake/config.cmake | |||||
| # generate Makefile | |||||
| mkdir build && cd build && cmake .. | |||||
| # compile | |||||
| # make all | |||||
| make -j 8 | |||||
| # make hetu, version is specified in cmake/config.cmake | |||||
| make hetu -j 8 | |||||
| # make allreduce module | |||||
| make allreduce -j 8 | |||||
| # make ps module | |||||
| make ps -j 8 | |||||
| # make geometric module | |||||
| make geometric -j 8 | |||||
| # make hetu-cache module | |||||
| make hetu_cache -j 8 | |||||
| ``` | ``` | ||||
4. Prepare the environment for running. Edit the `hetu.exp` file and set the environment path for Python and the path to the `mpirun` executable if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp` (a sanity-check sketch follows below).
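This sketch assumes `hetu.exp` exports the required paths (e.g., for the Hetu Python package and `mpirun`); check the file's actual contents before relying on it:
```bash
# Sketch only: hetu.exp is expected to export environment paths for Hetu;
# inspect and edit the file for your installation first.
source hetu.exp
which mpirun              # verify mpirun is reachable if you plan to use AllReduce
python -c "import hetu"   # quick check that the Hetu Python package is importable
```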
| ## Usage | |||||
| Train logistic regression on gpu: | |||||
```bash
| bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST | |||||
| ``` | |||||
| Train a 3-layer mlp on gpu: | |||||
| ```bash | |||||
| bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10 | |||||
| ``` | |||||
| Train a 3-layer cnn with gpu: | |||||
| ```bash | |||||
| bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST | |||||
| ``` | ``` | ||||
| Train a 3-layer mlp with allreduce on 8 gpus (use mpirun): | |||||
| ```bash | |||||
| bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10 | |||||
| ``` | |||||
| Train a 3-layer mlp with PS on 1 server and 2 workers: | |||||
| ```bash | |||||
| # in the script we launch the scheduler and server, and two workers | |||||
| bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10 | |||||
| ``` | |||||
| ## More Examples | |||||
Please refer to the examples directory, which contains CNN, NLP, CTR, and GNN training scripts. For distributed training, please refer to the CTR and GNN tasks.
| ## Community | |||||
| * Email: xupeng.miao@pku.edu.cn | |||||
| * Slack: coming soon | |||||
| * Hetu homepage: https://hetu-doc.readthedocs.io | |||||
| * [Committers & Contributors](COMMITTERS.md) | |||||
| * [Contributing to Hetu](CONTRIBUTING.md) | |||||
| * [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html) | |||||
| ## Enterprise Users | |||||
If you are an enterprise user and find Hetu useful in your work, please let us know, and we will be glad to add your company logo here.
| <img src="./img/tencent.png" width = "200"/> | |||||
| <img src="./img/alibabacloud.png" width = "200"/> | |||||
| <img src="./img/kuaishou.png" width = "200"/> | |||||
| ## License | |||||
The entire codebase is under the [license](LICENSE).
| ## Papers | |||||
1. Xupeng Miao, Lingxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021.
2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773). SIGMOD 2021.
| 3. coming soon | |||||
| ## Acknowledgements | |||||
| We learned and borrowed insights from a few open source projects including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel). | |||||
| ## Appendix | |||||
The prerequisites for the different modules in Hetu are listed as follows:
| ``` | |||||
| "*" means you should prepare by yourself, while others support auto-download | |||||
| Hetu: OpenMP(*), CMake(*) | |||||
| Hetu (version mkl): MKL 1.6.1 | |||||
| Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*) | |||||
| Hetu (version all): both | |||||
| Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs GPU version | |||||
| Hetu-PS: Protobuf(*), ZeroMQ 4.3.2 | |||||
| Hetu-Geometric: Pybind11(*), Metis(*) | |||||
| Hetu-Cache: Pybind11(*), this module needs PS module | |||||
| ################################################################## | |||||
| Tips for preparing the prerequisites | |||||
Preparing CUDA, CUDNN, NCCL (NCCL is already in the conda environment):
| 1. download from https://developer.nvidia.com | |||||
| 2. install | |||||
| 3. modify paths in cmake/config.cmake if necessary | |||||
| Preparing OpenMP: | |||||
You just need to ensure your compiler supports OpenMP.
| Preparing CMake, Protobuf, Pybind11, Metis: | |||||
| Install by anaconda: | |||||
| conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis | |||||
| Preparing OpenMPI (not necessary): | |||||
| install by anaconda: `conda install -c conda-forge openmpi=4.0.3` | |||||
| or | |||||
| 1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz | |||||
2. build openmpi by `./configure --prefix=/path/to/build && make -j8 && make install`
| 3. modify MPI_HOME to /path/to/build in cmake/config.cmake | |||||
| Preparing MKL (not necessary): | |||||
| install by anaconda: `conda install -c conda-forge onednn` | |||||
| or | |||||
| 1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz | |||||
| 2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8` | |||||
| 3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake | |||||
| Preparing ZeroMQ (not necessary): | |||||
| install by anaconda: `conda install -c anaconda zeromq=4.3.2` | |||||
| or | |||||
| 1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip | |||||
2. build zeromq by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
3. modify ZMQ_ROOT to /path/to/root and ZMQ_BUILD to /path/to/build in cmake/config.cmake
| ``` | |||||
| @@ -0,0 +1,2 @@ | |||||
| #!/bin/bash | |||||
python "$(cd "$(dirname "$0")"; pwd)"/../python/runner.py "$@"
| @@ -0,0 +1,75 @@ | |||||
| # Find the CUDNN libraries | |||||
| # | |||||
| # The following variables are optionally searched for defaults | |||||
| # CUDNN_ROOT: Base directory where CUDNN is found | |||||
| # CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for | |||||
| # CUDNN_LIBRARY: Directory where CUDNN library is searched for | |||||
| # CUDNN_STATIC: Are we looking for a static library? (default: no) | |||||
| # | |||||
| # The following are set after configuration is done: | |||||
| # CUDNN_FOUND | |||||
| # CUDNN_INCLUDE_PATH | |||||
| # CUDNN_LIBRARY_PATH | |||||
| # | |||||
| set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN") | |||||
if(DEFINED ENV{CUDNN_ROOT_DIR})
| message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.") | |||||
| endif() | |||||
| list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) | |||||
| # Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. | |||||
| list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT}) | |||||
| set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files") | |||||
| find_path(CUDNN_INCLUDE_PATH cudnn.h | |||||
| HINTS ${CUDNN_INCLUDE_DIR} | |||||
| PATH_SUFFIXES cuda/include cuda include | |||||
| REQUIRED) | |||||
| option(CUDNN_STATIC "Look for static CUDNN" OFF) | |||||
| if (CUDNN_STATIC) | |||||
| set(CUDNN_LIBNAME "libcudnn_static.a") | |||||
| else() | |||||
| set(CUDNN_LIBNAME "cudnn") | |||||
| endif() | |||||
| set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)") | |||||
| if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC) | |||||
| message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.") | |||||
| endif() | |||||
| find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME} | |||||
| PATHS ${CUDNN_LIBRARY} | |||||
| PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 | |||||
| REQUIRED) | |||||
| set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc") | |||||
| file(WRITE ${file} " | |||||
| #include <iostream> | |||||
| #include \"${CUDNN_INCLUDE_PATH}/cudnn.h\" | |||||
| int main() | |||||
| { | |||||
| std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' << CUDNN_PATCHLEVEL; | |||||
| int x = cudnnGetVersion(); | |||||
| return x == CUDNN_VERSION; | |||||
| } | |||||
| ") | |||||
| try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} | |||||
| RUN_OUTPUT_VARIABLE CUDNN_VERSION | |||||
| CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" | |||||
| LINK_LIBRARIES ${CUDNN_LIBRARY_PATH}) | |||||
| if (NOT CUDNN_VERSION_MATCHED) | |||||
| message(FATAL_ERROR "Found CUDNN header version and library version do not match! \ | |||||
| (include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.") | |||||
| endif() | |||||
| message(STATUS "CUDNN version: ${CUDNN_VERSION}") | |||||
| include(FindPackageHandleStandardArgs) | |||||
| find_package_handle_standard_args( | |||||
| CUDNN | |||||
| REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH | |||||
| VERSION_VAR CUDNN_VERSION) | |||||
| mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY) | |||||
| @@ -0,0 +1,70 @@ | |||||
| # Accepts the following variables: | |||||
| # | |||||
| # METIS_ROOT: Prefix where METIS is installed. | |||||
| # METIS_LIB_NAME: Name of the METIS library (default: metis). | |||||
| # METIS_LIBRARY: Full path of the METIS library. | |||||
| # Sets the following variables: | |||||
| # | |||||
| # METIS_LIBRARY: Full path of the METIS library. | |||||
# METIS_FOUND: True if METIS was found.
| # METIS_LIBRARIES: List of all libraries needed for linking with METIS, | |||||
| # | |||||
| # Provides the following macros: | |||||
| # | |||||
| # find_package(METIS) | |||||
| # | |||||
| # Searches for METIS (See above) | |||||
| # search metis header | |||||
| find_path(METIS_INCLUDE_DIR metis.h | |||||
| PATHS ${METIS_DIR} ${METIS_ROOT} | |||||
| PATH_SUFFIXES metis include include/metis Lib METISLib | |||||
| NO_DEFAULT_PATH | |||||
| DOC "Include directory of metis") | |||||
| find_path(METIS_INCLUDE_DIR metis.h | |||||
| PATH_SUFFIXES metis include include/metis Lib METISLib) | |||||
| set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library") | |||||
| # search metis library | |||||
| if(NOT METIS_LIB_NAME) | |||||
| set(METIS_LIB_NAME metis) | |||||
| endif(NOT METIS_LIB_NAME) | |||||
| find_library(METIS_LIBRARY ${METIS_LIB_NAME} | |||||
| PATHS ${METIS_DIR} ${METIS_ROOT} | |||||
| PATH_SUFFIXES lib | |||||
| NO_DEFAULT_PATH) | |||||
| find_library(METIS_LIBRARY ${METIS_LIB_NAME} | |||||
| PATH_SUFFIXES lib | |||||
| ) | |||||
| # behave like a CMake module is supposed to behave | |||||
| include(FindPackageHandleStandardArgs) | |||||
| find_package_handle_standard_args( | |||||
| "METIS" | |||||
| DEFAULT_MSG | |||||
| METIS_INCLUDE_DIR | |||||
| METIS_LIBRARY | |||||
| ) | |||||
| mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME) | |||||
| # if both headers and library are found, store results | |||||
| if(METIS_FOUND) | |||||
| set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR}) | |||||
| set(METIS_LIBRARIES ${METIS_LIBRARY}) | |||||
| # log result | |||||
| file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log | |||||
| "Determing location of METIS succeded:\n" | |||||
| "Include directory: ${METIS_INCLUDE_DIRS}\n" | |||||
| "Library directory: ${METIS_LIBRARIES}\n\n") | |||||
| else(METIS_FOUND) | |||||
# log erroneous result
| file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log | |||||
| "Determing location of METIS failed:\n" | |||||
| "Include directory: ${METIS_INCLUDE_DIRS}\n" | |||||
| "Library directory: ${METIS_LIBRARIES}\n\n") | |||||
| endif(METIS_FOUND) | |||||
| @@ -0,0 +1,14 @@ | |||||
| # - Try to find DNNL(MKL-DNN) | |||||
| # Once done this will define | |||||
| # DNNL_FOUND - System has DNNL | |||||
| # DNNL_INCLUDE_DIR - The DNNL include directories | |||||
| # DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build | |||||
| # DNNL_LIBRARY - The libraries needed to use DNNL | |||||
| # DNNL_DEFINITIONS - Compiler switches required for using DNNL | |||||
| find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include ) | |||||
| find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include ) | |||||
| find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src ) | |||||
| include ( FindPackageHandleStandardArgs ) | |||||
| find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR ) | |||||
| @@ -0,0 +1,97 @@ | |||||
| # Try to find NCCL | |||||
| # | |||||
| # The following variables are optionally searched for defaults | |||||
| # NCCL_ROOT: Base directory where all NCCL components are found | |||||
| # NCCL_ROOT_DIR: Base directory where all NCCL components are found | |||||
| # NCCL_INCLUDE_DIR: Directory where NCCL header is found | |||||
| # NCCL_LIB_DIR: Directory where NCCL library is found | |||||
| # | |||||
| # The following are set after configuration is done: | |||||
| # NCCL_FOUND | |||||
| # NCCL_INCLUDE_DIRS | |||||
| # NCCL_LIBRARIES | |||||
| # | |||||
| # The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks | |||||
| # install NCCL in the same location as the CUDA toolkit. | |||||
| # See https://github.com/caffe2/caffe2/issues/1601 | |||||
| if (NOT DEFINED NCCL_ROOT) | |||||
| set(NCCL_ROOT $ENV{CONDA_PREFIX}) | |||||
| endif() | |||||
| set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL") | |||||
| find_path(NCCL_INCLUDE_DIRS | |||||
| NAMES nccl.h | |||||
| HINTS | |||||
| ${NCCL_ROOT} | |||||
| ${NCCL_ROOT}/include | |||||
| ${NCCL_INCLUDE_DIR} | |||||
| ${NCCL_ROOT_DIR} | |||||
| ${NCCL_ROOT_DIR}/include | |||||
| ${CUDA_TOOLKIT_ROOT_DIR}/include | |||||
| REQUIRED) | |||||
| if ($ENV{USE_STATIC_NCCL}) | |||||
| message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library") | |||||
| set(NCCL_LIBNAME "libnccl_static.a") | |||||
| else() | |||||
| set(NCCL_LIBNAME "nccl") | |||||
| endif() | |||||
| find_library(NCCL_LIBRARIES | |||||
| NAMES ${NCCL_LIBNAME} | |||||
| HINTS | |||||
| ${NCCL_LIB_DIR} | |||||
| ${NCCL_ROOT} | |||||
| ${NCCL_ROOT}/lib | |||||
| ${NCCL_ROOT}/lib/x86_64-linux-gnu | |||||
| ${NCCL_ROOT}/lib64 | |||||
| ${NCCL_ROOT_DIR} | |||||
| ${NCCL_ROOT_DIR}/lib | |||||
| ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu | |||||
| ${NCCL_ROOT_DIR}/lib64 | |||||
| ${CUDA_TOOLKIT_ROOT_DIR}/lib64 | |||||
| REQUIRED) | |||||
| set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") | |||||
| message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") | |||||
| set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) | |||||
| list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR}) | |||||
| include(CheckCXXSymbolExists) | |||||
| check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) | |||||
| if (NCCL_VERSION_DEFINED) | |||||
| set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") | |||||
| file(WRITE ${file} " | |||||
| #include <iostream> | |||||
| #include \"${NCCL_HEADER_FILE}\" | |||||
| int main() | |||||
| { | |||||
| std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH; | |||||
| int x; | |||||
| ncclGetVersion(&x); | |||||
| return x == NCCL_VERSION_CODE; | |||||
| } | |||||
| ") | |||||
| try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} | |||||
| RUN_OUTPUT_VARIABLE NCCL_VERSION | |||||
| CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" | |||||
| LINK_LIBRARIES ${NCCL_LIBRARIES}) | |||||
| if (NOT NCCL_VERSION_MATCHED) | |||||
| message(FATAL_ERROR "Found NCCL header version and library version do not match! \ | |||||
| (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") | |||||
| endif() | |||||
| message(STATUS "NCCL version: ${NCCL_VERSION}") | |||||
| else() | |||||
| message(STATUS "NCCL version < 2.3.5-5") | |||||
| endif () | |||||
| set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) | |||||
| mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) | |||||
| include(FindPackageHandleStandardArgs) | |||||
| find_package_handle_standard_args( | |||||
| NCCL | |||||
| REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES | |||||
| VERSION_VAR NCCL_VERSION) | |||||
| @@ -0,0 +1,47 @@ | |||||
| # - Try to find ZMQ | |||||
| # Once done this will define | |||||
| # ZMQ_FOUND - System has ZMQ | |||||
| # ZMQ_INCLUDE_DIRS - The ZMQ include directories | |||||
| # ZMQ_LIBRARIES - The libraries needed to use ZMQ | |||||
| # ZMQ_DEFINITIONS - Compiler switches required for using ZMQ | |||||
| find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include ) | |||||
| find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib ) | |||||
| set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} ) | |||||
| set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} ) | |||||
| if (DEFINED ZMQ_LIBRARIES AND DEFINED ZMQ_INCLUDE_DIRS) | |||||
| set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc") | |||||
| file(WRITE ${file} " | |||||
| #include <iostream> | |||||
| #include \"${ZMQ_INCLUDE_DIRS}/zmq.h\" | |||||
| int main() | |||||
| { | |||||
| std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH; | |||||
| int x, y, z; | |||||
| zmq_version(&x, &y, &z); | |||||
| return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH; | |||||
| } | |||||
| ") | |||||
| try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} | |||||
| RUN_OUTPUT_VARIABLE ZMQ_VERSION | |||||
| LINK_LIBRARIES ${ZMQ_LIBRARIES}) | |||||
| if (NOT ZMQ_VERSION_MATCHED) | |||||
| message(WARNING "Found ZMQ header version and library version do not match! \ | |||||
| (include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). Please set ZMQ_ROOT and ZMQ_BUILD carefully.") | |||||
| unset(ZMQ_INCLUDE_DIRS) | |||||
| unset(ZMQ_LIBRARIES) | |||||
| unset(ZMQ_VERSION) | |||||
| else () | |||||
| message(STATUS "ZMQ version: ${ZMQ_VERSION}") | |||||
| endif() | |||||
| endif() | |||||
| include ( FindPackageHandleStandardArgs ) | |||||
| # handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE | |||||
| # if all listed variables are TRUE | |||||
| find_package_handle_standard_args ( | |||||
| ZMQ | |||||
| REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS | |||||
| VERSION_VAR ZMQ_VERSION) | |||||
| @@ -0,0 +1,55 @@ | |||||
| ###################### | |||||
| ### Set targets ###### | |||||
| ###################### | |||||
| # hetu main version, choose from (mkl, gpu, all) | |||||
| # if using mkl (for CPU) or all, OpenMP(*), mkl required | |||||
| # if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required | |||||
| set(HETU_VERSION "all") | |||||
| # whether to compile allreduce module | |||||
| # nccl(*), openmpi required | |||||
| set(HETU_ALLREDUCE ON) | |||||
| # whether to compile ps module | |||||
| # protobuf(*), zeromq required | |||||
| set(HETU_PS ON) | |||||
| # whether to compile geometric module (for GNNs) | |||||
| # pybind11(*), metis(*) required | |||||
| set(HETU_GEOMETRIC ON) | |||||
| # whether to compile cache module (for PS) | |||||
| # to enable this, you must turn HETU_PS on | |||||
| # pybind11(*) required | |||||
| set(HETU_CACHE ON) | |||||
| # whether to compile Hetu ML Module | |||||
| set(HETU_ML ON) | |||||
| set(HETU_PARALLEL_ML ON) | |||||
| ###################### | |||||
| ### Set paths ######## | |||||
| ###################### | |||||
| # CUDA version >= 10.1 | |||||
| set(CUDAToolkit_ROOT /usr/local/cuda) | |||||
| # NCCL version >= 2.8 | |||||
| set(NCCL_ROOT $ENV{CONDA_PREFIX}) | |||||
| set(CUDNN_ROOT) | |||||
| # MPI version >= 3.1 (OpenMPI version >= 4.0.3) | |||||
# if a valid version is not found, we'll download and compile it on the fly (openmpi-4.0.3)
| set(MPI_HOME $ENV{CONDA_PREFIX}) | |||||
| # MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl | |||||
# if not found, we'll download and compile it on the fly
| set(MKL_ROOT $ENV{CONDA_PREFIX}) | |||||
| set(MKL_BUILD $ENV{CONDA_PREFIX}) | |||||
| # ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq | |||||
# if not found, we'll download and compile it on the fly
| set(ZMQ_ROOT $ENV{CONDA_PREFIX}) | |||||
| set(ZMQ_BUILD $ENV{CONDA_PREFIX}) | |||||
| @@ -0,0 +1,84 @@ | |||||
| name: hetu | |||||
| channels: | |||||
| - conda-forge | |||||
| - defaults | |||||
| dependencies: | |||||
| - _libgcc_mutex=0.1=main | |||||
| - _openmp_mutex=4.5=1_gnu | |||||
| - bcrypt=3.2.0=py37h5e8e339_1 | |||||
| - blas=1.0=mkl | |||||
| - bzip2=1.0.8=h7b6447c_0 | |||||
| - ca-certificates=2021.7.5=h06a4308_1 | |||||
| - certifi=2021.5.30=py37h06a4308_0 | |||||
| - cffi=1.14.6=py37hc58025e_0 | |||||
| - cmake=3.18.2=ha30ef3c_0 | |||||
| - cryptography=3.4.7=py37h5d9358c_0 | |||||
| - cudatoolkit=10.1.243=h6bb024c_0 | |||||
| - expat=2.4.1=h2531618_2 | |||||
| - intel-openmp=2021.3.0=h06a4308_3350 | |||||
| - joblib=1.0.1=pyhd3eb1b0_0 | |||||
| - krb5=1.18.2=h173b8e3_0 | |||||
| - ld_impl_linux-64=2.35.1=h7274673_9 | |||||
| - libcurl=7.71.1=h20c2e04_1 | |||||
| - libedit=3.1.20210216=h27cfd23_1 | |||||
| - libffi=3.3=he6710b0_2 | |||||
| - libgcc-ng=9.3.0=h5101ec6_17 | |||||
| - libgfortran-ng=7.5.0=h14aa051_19 | |||||
| - libgfortran4=7.5.0=h14aa051_19 | |||||
| - libgomp=9.3.0=h5101ec6_17 | |||||
| - libprotobuf=3.15.8=h780b84a_0 | |||||
| - libsodium=1.0.18=h7b6447c_0 | |||||
| - libssh2=1.9.0=h1ba5d50_1 | |||||
| - libstdcxx-ng=9.3.0=hd4cf53a_17 | |||||
| - libuv=1.40.0=h7b6447c_0 | |||||
| - lz4-c=1.9.3=h2531618_0 | |||||
| - metis=5.1.0=hf484d3e_4 | |||||
| - mkl=2021.3.0=h06a4308_520 | |||||
| - mkl-service=2.4.0=py37h7f8727e_0 | |||||
| - mkl_fft=1.3.0=py37h42c9631_2 | |||||
| - mkl_random=1.2.2=py37h51133e4_0 | |||||
| - mpi=1.0=openmpi | |||||
| - nccl=2.8.3.1=hcaf9a05_0 | |||||
| - ncurses=6.2=he6710b0_1 | |||||
| - numpy=1.20.3=py37hf144106_0 | |||||
| - numpy-base=1.20.3=py37h74d4b33_0 | |||||
| - onednn=2.3=omp_hf4ef041_0 | |||||
| - onnx=1.9.0=py37h284874a_0 | |||||
| - onnxruntime=1.7.2=py37he8cb6d3_1 | |||||
| - openmpi=4.0.3=hdf1f1ad_1 | |||||
| - openssl=1.1.1k=h27cfd23_0 | |||||
| - pandas=1.2.5=py37h295c915_0 | |||||
| - paramiko=2.7.2=pyh9f0ad1d_0 | |||||
| - pip=21.1.3=py37h06a4308_0 | |||||
| - protobuf=3.15.8=py37hcd2ae1e_0 | |||||
| - psutil=5.8.0=py37h5e8e339_1 | |||||
| - pybind11=2.6.2=py37hff7bd54_1 | |||||
| - pycparser=2.20=pyh9f0ad1d_2 | |||||
| - pynacl=1.4.0=py37h5e8e339_2 | |||||
| - python=3.7.10=h12debd9_4 | |||||
| - python-dateutil=2.8.2=pyhd3eb1b0_0 | |||||
| - python_abi=3.7=2_cp37m | |||||
| - pytz=2021.1=pyhd3eb1b0_0 | |||||
| - pyyaml=5.4.1=py37h27cfd23_1 | |||||
| - re2=2021.04.01=h9c3ff4c_0 | |||||
| - readline=8.1=h27cfd23_0 | |||||
| - rhash=1.4.1=h3c74f83_1 | |||||
| - scikit-learn=0.24.2=py37ha9443f7_0 | |||||
| - scipy=1.6.2=py37had2a1c9_1 | |||||
| - setuptools=52.0.0=py37h06a4308_0 | |||||
| - six=1.16.0=pyhd3eb1b0_0 | |||||
| - sqlite=3.36.0=hc218d9a_0 | |||||
| - threadpoolctl=2.2.0=pyhb85f177_0 | |||||
| - tk=8.6.10=hbc83047_0 | |||||
| - tqdm=4.61.2=pyhd3eb1b0_1 | |||||
| - typing-extensions=3.10.0.0=hd8ed1ab_0 | |||||
| - typing_extensions=3.10.0.0=pyha770c72_0 | |||||
| - wheel=0.36.2=pyhd3eb1b0_0 | |||||
| - xz=5.2.5=h7b6447c_0 | |||||
| - yaml=0.2.5=h7b6447c_0 | |||||
| - zeromq=4.3.2=he6710b0_3 | |||||
| - zlib=1.2.11=h7b6447c_3 | |||||
| - zstd=1.4.9=haebb681_0 | |||||
| - pip: | |||||
| - cloudpickle==1.6.0 | |||||
| - wget==3.2 | |||||
| @@ -0,0 +1,49 @@ | |||||
| # CNN Examples | |||||
In this directory we provide simple implementations of CNN models, including Hetu, TensorFlow, and PyTorch versions for comparison.
| ## Structure | |||||
| ``` | |||||
| - cnn | |||||
| - models/ CNN models in HETU | |||||
| - pytorch_models/ CNN models in PyTorch | |||||
| - tf_models/ CNN models in TensorFlow | |||||
| - scripts/ Test scripts | |||||
| - main.py Trainer for HETU | |||||
| - run_tf_horovod.py Trainer for Horovod | |||||
| - tf_launch_server.py Trainer for TF-PS (role: server) | |||||
| - tf_launch_worker.py Trainer for TF-PS (role: worker) | |||||
| - tf_main.py Trainer for TensorFlow | |||||
| - torch_main.py Trainer for Pytorch | |||||
| - | |||||
| ``` | |||||
| ## Usage | |||||
| Here are some examples of running scripts. | |||||
| ```bash | |||||
| bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu | |||||
| bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node) | |||||
bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 16-GPU (2-nodes)
| ``` | |||||
To train in the PS setting, we also need to launch the scheduler and server first. For more information about distributed training, please refer to the CTR or GNN examples.
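For reference, the top-level README launches the scheduler, one server, and two GPU workers for this example through a wrapper script; run from this directory, the invocation looks like:
```bash
# launches the scheduler, one server, and two workers (see the top-level README)
bash scripts/hetu_2gpu_ps.sh mlp CIFAR10
```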
We can change the settings in the scripts. See `mnist_mlp.sh` below.
| ```bash | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../main.py | |||||
| ### validate and timing | |||||
| python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing | |||||
| ### run in cpu | |||||
| # python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing | |||||
| ``` | |||||
For more details about the training settings, please refer to `main.py`; an example direct invocation is shown below.
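The flag names in the following sketch are taken from the argparse setup in `main.py`; adjust the values to your needs:
```bash
# run from the cnn example directory; all flags below are defined in main.py
python main.py --model resnet18 --dataset CIFAR10 \
    --opt momentum --learning-rate 0.01 --num-epochs 20 \
    --gpu 0 --validate --timing
```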
| ## Models | |||||
We provide the following models for specific datasets; an example command for one combination is shown after the list.
| ``` | |||||
| CIFAR100: VGG, ResNet | |||||
| CIFAR10: MLP, VGG, ResNet | |||||
| MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN | |||||
| ``` | |||||
| @@ -0,0 +1,10 @@ | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13030 | |||||
| DMLC_NUM_WORKER : 2 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| launch : | |||||
| worker : 0 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| @@ -0,0 +1,202 @@ | |||||
| import hetu as ht | |||||
| import models | |||||
| import os | |||||
| import numpy as np | |||||
| import argparse | |||||
| import json | |||||
| import logging | |||||
| from time import time | |||||
| logging.basicConfig(level=logging.INFO, | |||||
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |||||
| logger = logging.getLogger(__name__) | |||||
| def print_rank0(msg): | |||||
| if device_id == 0: | |||||
| logger.info(msg) | |||||
| if __name__ == "__main__": | |||||
| # argument parser | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument('--model', type=str, required=True, | |||||
| help='model to be tested') | |||||
| parser.add_argument('--dataset', type=str, required=True, | |||||
| help='dataset to be trained on') | |||||
| parser.add_argument('--batch-size', type=int, | |||||
| default=128, help='batch size') | |||||
| parser.add_argument('--learning-rate', type=float, | |||||
| default=0.1, help='learning rate') | |||||
| parser.add_argument('--opt', type=str, default='sgd', | |||||
| help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') | |||||
| parser.add_argument('--num-epochs', type=int, | |||||
| default=10, help='epoch number') | |||||
| parser.add_argument('--gpu', type=int, default=0, | |||||
| help='gpu to be used, -1 means cpu') | |||||
| parser.add_argument('--validate', action='store_true', | |||||
| help='whether to use validation') | |||||
| parser.add_argument('--timing', action='store_true', | |||||
| help='whether to time the training phase') | |||||
| parser.add_argument('--comm-mode', default=None, help='communication mode') | |||||
| args = parser.parse_args() | |||||
| global device_id | |||||
| device_id = 0 | |||||
| print_rank0("Training {} on HETU".format(args.model)) | |||||
| if args.comm_mode in ('AllReduce', 'Hybrid'): | |||||
| comm, device_id = ht.mpi_nccl_init() | |||||
| executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) | |||||
| else: | |||||
| if args.gpu == -1: | |||||
| executor_ctx = ht.cpu(0) | |||||
| print_rank0('Use CPU.') | |||||
| else: | |||||
| executor_ctx = ht.gpu(args.gpu) | |||||
| print_rank0('Use GPU %d.' % args.gpu) | |||||
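| # For PS/Hybrid modes, export the per-worker settings (e.g. the DMLC_* variables) from worker_conf<gpu>.json as environment variables. | |||||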
| if args.comm_mode in ('PS', 'Hybrid'): | |||||
| settings_file = open(os.path.join(os.path.abspath( | |||||
| os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu)) | |||||
| settings = json.load(settings_file) | |||||
| for key in settings: | |||||
| if type(settings[key]) == str: | |||||
| os.environ[key] = settings[key] | |||||
| else: | |||||
| os.environ[key] = str(settings[key])  # environment variables must be strings | |||||
| assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \ | |||||
| 'Model not supported!' | |||||
| model = eval('models.' + args.model) | |||||
| assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] | |||||
| dataset = args.dataset | |||||
| assert args.opt in ['sgd', 'momentum', 'nesterov', | |||||
| 'adagrad', 'adam'], 'Optimizer not supported!' | |||||
| if args.opt == 'sgd': | |||||
| print_rank0('Use SGD Optimizer.') | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) | |||||
| elif args.opt == 'momentum': | |||||
| print_rank0('Use Momentum Optimizer.') | |||||
| opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate) | |||||
| elif args.opt == 'nesterov': | |||||
| print_rank0('Use Nesterov Momentum Optimizer.') | |||||
| opt = ht.optim.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, nesterov=True) | |||||
| elif args.opt == 'adagrad': | |||||
| print_rank0('Use AdaGrad Optimizer.') | |||||
| opt = ht.optim.AdaGradOptimizer( | |||||
| learning_rate=args.learning_rate, initial_accumulator_value=0.1) | |||||
| else: | |||||
| print_rank0('Use Adam Optimizer.') | |||||
| opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate) | |||||
| # data loading | |||||
| print_rank0('Loading %s data...' % dataset) | |||||
| if dataset == 'MNIST': | |||||
| datasets = ht.data.mnist() | |||||
| train_set_x, train_set_y = datasets[0] | |||||
| valid_set_x, valid_set_y = datasets[1] | |||||
| test_set_x, test_set_y = datasets[2] | |||||
| # train_set_x: (50000, 784), train_set_y: (50000, 10) | |||||
| # valid_set_x: (10000, 784), valid_set_y: (10000, 10) | |||||
| # x_shape = (args.batch_size, 784) | |||||
| # y_shape = (args.batch_size, 10) | |||||
| elif dataset == 'CIFAR10': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( | |||||
| num_class=10) | |||||
| if args.model == "mlp": | |||||
| train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) | |||||
| valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) | |||||
| # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10) | |||||
| # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10) | |||||
| # x_shape = (args.batch_size, 3, 32, 32) | |||||
| # y_shape = (args.batch_size, 10) | |||||
| elif dataset == 'CIFAR100': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( | |||||
| num_class=100) | |||||
| # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100) | |||||
| # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| # model definition | |||||
| print_rank0('Building model {}'.format(args.model)) | |||||
| x = ht.dataloader_op([ | |||||
| ht.Dataloader(train_set_x, args.batch_size, 'train'), | |||||
| ht.Dataloader(valid_set_x, args.batch_size, 'validate'), | |||||
| ]) | |||||
| y_ = ht.dataloader_op([ | |||||
| ht.Dataloader(train_set_y, args.batch_size, 'train'), | |||||
| ht.Dataloader(valid_set_y, args.batch_size, 'validate'), | |||||
| ]) | |||||
| if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': | |||||
| loss, y = model(x, y_, 100) | |||||
| else: | |||||
| loss, y = model(x, y_) | |||||
| train_op = opt.minimize(loss) | |||||
| eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]} | |||||
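| # The 'train' pass runs the optimizer as well; the 'validate' pass only evaluates the loss and predictions. | |||||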
| executor = ht.Executor(eval_nodes, ctx=executor_ctx, | |||||
| comm_mode=args.comm_mode) | |||||
| n_train_batches = executor.get_batch_num('train') | |||||
| n_valid_batches = executor.get_batch_num('validate') | |||||
| # training | |||||
| print_rank0("Start training loop...") | |||||
| running_time = 0 | |||||
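| # One extra epoch is run as warm-up; its time is excluded from the reported total (the i != 0 check below). | |||||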
| for i in range(args.num_epochs + 1): | |||||
| print_rank0("Epoch %d" % i) | |||||
| loss_all = 0 | |||||
| batch_num = 0 | |||||
| if args.timing: | |||||
| start = time() | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_train_batches): | |||||
| loss_val, predict_y, y_val, _ = executor.run( | |||||
| 'train', eval_node_list=[loss, y, y_, train_op]) | |||||
| # Loss for this minibatch | |||||
| predict_y = predict_y.asnumpy() | |||||
| y_val = y_val.asnumpy() | |||||
| loss_all += loss_val.asnumpy() | |||||
| batch_num += 1 | |||||
| # Predict accuracy for this minibatch | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(predict_y, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Train loss = %f" % loss_all) | |||||
| print_rank0("Train accuracy = %f" % accuracy) | |||||
| if args.timing: | |||||
| end = time() | |||||
| during_time = end - start | |||||
| print_rank0("Running time of current epoch = %fs" % (during_time)) | |||||
| if i != 0: | |||||
| running_time += during_time | |||||
| if args.validate: | |||||
| val_loss_all = 0 | |||||
| batch_num = 0 | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_valid_batches): | |||||
| loss_val, valid_y_predicted, y_val = executor.run( | |||||
| 'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True) | |||||
| val_loss_all += loss_val | |||||
| batch_num += 1 | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(valid_y_predicted, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| val_loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Validation loss = %f" % val_loss_all) | |||||
| print_rank0("Validation accuracy = %f" % accuracy) | |||||
| print_rank0("*"*50) | |||||
| print_rank0("Running time of total %d epoch = %fs" % | |||||
| (args.num_epochs, running_time)) | |||||
| if args.comm_mode in ('AllReduce', 'Hybrid'): | |||||
| ht.mpi_nccl_finish(comm) | |||||
| @@ -0,0 +1,61 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False): | |||||
| weight = init.random_normal( | |||||
| shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') | |||||
| bn_scale = init.random_normal( | |||||
| shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale') | |||||
| bn_bias = init.random_normal( | |||||
| shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias') | |||||
| x = ht.conv2d_op(x, weight, stride=1, padding=1) | |||||
| x = ht.batch_normalization_op(x, bn_scale, bn_bias) | |||||
| if with_relu: | |||||
| x = ht.relu_op(x) | |||||
| if with_pool: | |||||
| x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0) | |||||
| return x | |||||
| def fc(x, shape, name, with_relu=True): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') | |||||
| bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| x = x + ht.broadcastto_op(bias, x) | |||||
| if with_relu: | |||||
| x = ht.relu_op(x) | |||||
| return x | |||||
| def alexnet(x, y_): | |||||
| ''' | |||||
| AlexNet model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print('Building AlexNet model...') | |||||
| x = ht.array_reshape_op(x, [-1, 1, 28, 28]) | |||||
| x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1', | |||||
| with_relu=True, with_pool=True) | |||||
| x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2', | |||||
| with_relu=True, with_pool=True) | |||||
| x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3', | |||||
| with_relu=True, with_pool=False) | |||||
| x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4', | |||||
| with_relu=True, with_pool=False) | |||||
| x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5', | |||||
| with_relu=False, with_pool=True) | |||||
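| # After three 2x2 max-poolings, the 28x28 input is reduced to a 3x3 feature map with 256 channels. | |||||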
| x = ht.array_reshape_op(x, (-1, 256*3*3)) | |||||
| x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True) | |||||
| x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True) | |||||
| y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,41 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def conv_relu_avg(x, shape): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1) | |||||
| x = ht.conv2d_op(x, weight, padding=2, stride=1) | |||||
| x = ht.relu_op(x) | |||||
| x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| return x | |||||
| def fc(x, shape): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1) | |||||
| bias = init.random_normal(shape=shape[-1:], stddev=0.1) | |||||
| x = ht.array_reshape_op(x, (-1, shape[0])) | |||||
| x = ht.matmul_op(x, weight) | |||||
| y = x + ht.broadcastto_op(bias, x) | |||||
| return y | |||||
| def cnn_3_layers(x, y_): | |||||
| ''' | |||||
| 3-layer-CNN model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print('Building 3-layer-CNN model...') | |||||
| x = ht.array_reshape_op(x, [-1, 1, 28, 28]) | |||||
| x = conv_relu_avg(x, (32, 1, 5, 5)) | |||||
| x = conv_relu_avg(x, (64, 32, 5, 5)) | |||||
| y = fc(x, (7 * 7 * 64, 10)) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,90 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| def lstm(x, y_): | |||||
| ''' | |||||
| LSTM model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| diminput = 28 | |||||
| dimhidden = 128 | |||||
| dimoutput = 10 | |||||
| nsteps = 28 | |||||
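| # Each 784-dim MNIST input is processed as nsteps=28 time steps of diminput=28 features. | |||||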
| forget_gate_w = init.random_normal( | |||||
| shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w") | |||||
| forget_gate_u = init.random_normal( | |||||
| shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u") | |||||
| forget_gate_b = init.random_normal( | |||||
| shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b") | |||||
| input_gate_w = init.random_normal( | |||||
| shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w") | |||||
| input_gate_u = init.random_normal( | |||||
| shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u") | |||||
| input_gate_b = init.random_normal( | |||||
| shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b") | |||||
| output_gate_w = init.random_normal( | |||||
| shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w") | |||||
| output_gate_u = init.random_normal( | |||||
| shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u") | |||||
| output_gate_b = init.random_normal( | |||||
| shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b") | |||||
| tanh_w = init.random_normal( | |||||
| shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w") | |||||
| tanh_u = init.random_normal( | |||||
| shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u") | |||||
| tanh_b = init.random_normal( | |||||
| shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b") | |||||
| out_weights = init.random_normal( | |||||
| shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight") | |||||
| out_bias = init.random_normal( | |||||
| shape=(dimoutput,), stddev=0.1, name="lstm_out_bias") | |||||
| initial_state = ht.Variable(value=np.zeros((1,)).astype( | |||||
| np.float32), name='initial_state', trainable=False) | |||||
| for i in range(nsteps): | |||||
| cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput)) | |||||
| # forget gate | |||||
| if i == 0: | |||||
| temp = ht.matmul_op(cur_x, forget_gate_w) | |||||
| last_c_state = ht.broadcastto_op(initial_state, temp) | |||||
| last_h_state = ht.broadcastto_op(initial_state, temp) | |||||
| cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp | |||||
| else: | |||||
| cur_forget = ht.matmul_op( | |||||
| last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w) | |||||
| cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget) | |||||
| cur_forget = ht.sigmoid_op(cur_forget) | |||||
| # input gate | |||||
| cur_input = ht.matmul_op( | |||||
| last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w) | |||||
| cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input) | |||||
| cur_input = ht.sigmoid_op(cur_input) | |||||
| # output gate | |||||
| cur_output = ht.matmul_op( | |||||
| last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w) | |||||
| cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output) | |||||
| cur_output = ht.sigmoid_op(cur_output) | |||||
| # tanh | |||||
| cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \ | |||||
| ht.matmul_op(cur_x, tanh_w) | |||||
| cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh) | |||||
| cur_tanh = ht.tanh_op(cur_tanh) | |||||
| last_c_state = ht.mul_op(last_c_state, cur_forget) + \ | |||||
| ht.mul_op(cur_input, cur_tanh) | |||||
| last_h_state = ht.tanh_op(last_c_state) * cur_output | |||||
| x = ht.matmul_op(last_h_state, out_weights) | |||||
| y = x + ht.broadcastto_op(out_bias, x) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,46 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def conv_pool(x, in_channel, out_channel, name): | |||||
| weight = init.random_normal( | |||||
| shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight') | |||||
| x = ht.conv2d_op(x, weight, padding=2, stride=1) | |||||
| x = ht.relu_op(x) | |||||
| x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| return x | |||||
| def fc(x, shape, name, with_relu=True): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') | |||||
| bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| x = x + ht.broadcastto_op(bias, x) | |||||
| if with_relu: | |||||
| x = ht.relu_op(x) | |||||
| return x | |||||
| def lenet(x, y_): | |||||
| ''' | |||||
| LeNet model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print('Building LeNet model...') | |||||
| x = ht.array_reshape_op(x, (-1, 1, 28, 28)) | |||||
| x = conv_pool(x, 1, 6, name='lenet_conv1') | |||||
| x = conv_pool(x, 6, 16, name='lenet_conv2') | |||||
| x = ht.array_reshape_op(x, (-1, 7*7*16)) | |||||
| x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True) | |||||
| x = fc(x, (120, 84), name='lenet_fc2', with_relu=True) | |||||
| y = fc(x, (84, 10), name='lenet_fc3', with_relu=False) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,24 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def logreg(x, y_): | |||||
| ''' | |||||
| Logistic Regression model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print("Build logistic regression model...") | |||||
| weight = init.zeros((784, 10), name='logreg_weight') | |||||
| bias = init.zeros((10,), name='logreg_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| y = x + ht.broadcastto_op(bias, x) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,33 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def fc(x, shape, name, with_relu=True): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') | |||||
| bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| x = x + ht.broadcastto_op(bias, x) | |||||
| if with_relu: | |||||
| x = ht.relu_op(x) | |||||
| return x | |||||
| def mlp(x, y_): | |||||
| ''' | |||||
| MLP model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print("Building MLP model...") | |||||
| x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True) | |||||
| x = fc(x, (256, 256), 'mlp_fc2', with_relu=True) | |||||
| y = fc(x, (256, 10), 'mlp_fc3', with_relu=False) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,56 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| def rnn(x, y_): | |||||
| ''' | |||||
| RNN model, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| print("Building RNN model...") | |||||
| diminput = 28 | |||||
| dimhidden = 128 | |||||
| dimoutput = 10 | |||||
| nsteps = 28 | |||||
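| # Each 784-dim MNIST input is sliced into nsteps=28 steps of diminput=28 features (see slice_op below). | |||||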
| weight1 = init.random_normal( | |||||
| shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1') | |||||
| bias1 = init.random_normal( | |||||
| shape=(dimhidden, ), stddev=0.1, name='rnn_bias1') | |||||
| weight2 = init.random_normal( | |||||
| shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2') | |||||
| bias2 = init.random_normal( | |||||
| shape=(dimhidden, ), stddev=0.1, name='rnn_bias2') | |||||
| weight3 = init.random_normal( | |||||
| shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3') | |||||
| bias3 = init.random_normal( | |||||
| shape=(dimoutput, ), stddev=0.1, name='rnn_bias3') | |||||
| last_state = ht.Variable(value=np.zeros((1,)).astype( | |||||
| np.float32), name='initial_state', trainable=False) | |||||
| for i in range(nsteps): | |||||
| cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput)) | |||||
| h = ht.matmul_op(cur_x, weight1) | |||||
| h = h + ht.broadcastto_op(bias1, h) | |||||
| if i == 0: | |||||
| last_state = ht.broadcastto_op(last_state, h) | |||||
| s = ht.concat_op(h, last_state, axis=1) | |||||
| s = ht.matmul_op(s, weight2) | |||||
| s = s + ht.broadcastto_op(bias2, s) | |||||
| last_state = ht.relu_op(s) | |||||
| final_state = last_state | |||||
| x = ht.matmul_op(final_state, weight3) | |||||
| y = x + ht.broadcastto_op(bias3, x) | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| @@ -0,0 +1,125 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''): | |||||
| weight = init.random_normal( | |||||
| shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') | |||||
| x = ht.conv2d_op(x, weight, stride=stride, padding=padding) | |||||
| return x | |||||
| def batch_norm_with_relu(x, hidden, name): | |||||
| scale = init.random_normal( | |||||
| shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale') | |||||
| bias = init.random_normal(shape=(1, hidden, 1, 1), | |||||
| stddev=0.1, name=name+'_bias') | |||||
| x = ht.batch_normalization_op(x, scale, bias) | |||||
| x = ht.relu_op(x) | |||||
| return x | |||||
| def resnet_block(x, in_channel, num_blocks, is_first=False, name=''): | |||||
| if is_first: | |||||
| out_channel = in_channel | |||||
| identity = x | |||||
| x = conv2d(x, in_channel, out_channel, stride=1, | |||||
| padding=1, name=name+'_conv1') | |||||
| x = batch_norm_with_relu(x, out_channel, name+'_bn1') | |||||
| x = conv2d(x, out_channel, out_channel, stride=1, | |||||
| padding=1, name=name+'_conv2') | |||||
| x = x + identity | |||||
| else: | |||||
| out_channel = 2 * in_channel | |||||
| identity = x | |||||
| x = batch_norm_with_relu(x, in_channel, name+'_bn0') | |||||
| x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]]) | |||||
| x = conv2d(x, in_channel, out_channel, stride=2, | |||||
| padding=0, name=name+'_conv1') | |||||
| x = batch_norm_with_relu(x, out_channel, name+'_bn1') | |||||
| x = conv2d(x, out_channel, out_channel, stride=1, | |||||
| padding=1, name=name+'_conv2') | |||||
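| # Downsampling shortcut: average-pool the identity path and zero-pad its channels from in_channel to out_channel. | |||||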
| identity = ht.avg_pool2d_op( | |||||
| identity, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| identity = ht.pad_op( | |||||
| identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]]) | |||||
| x = x + identity | |||||
| for i in range(1, num_blocks): | |||||
| identity = x | |||||
| x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i)) | |||||
| x = conv2d(x, out_channel, out_channel, stride=1, | |||||
| padding=1, name=name+'_conv%d' % (2 * i + 1)) | |||||
| x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1)) | |||||
| x = conv2d(x, out_channel, out_channel, stride=1, | |||||
| padding=1, name=name+'_conv%d' % (2 * i + 2)) | |||||
| x = x + identity | |||||
| return x | |||||
| def fc(x, shape, name): | |||||
| weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') | |||||
| bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| x = x + ht.broadcastto_op(bias, x) | |||||
| return x | |||||
| def resnet(x, y_, num_layers=18, num_class=10): | |||||
| ''' | |||||
| ResNet model, for CIFAR10 dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| num_layers: 18 or 34 | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| base_size = 16 | |||||
| x = conv2d(x, 3, base_size, stride=1, padding=1, | |||||
| name='resnet_initial_conv') | |||||
| x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn') | |||||
| if num_layers == 18: | |||||
| print("Building ResNet-18 model...") | |||||
| x = resnet_block(x, base_size, num_blocks=2, | |||||
| is_first=True, name='resnet_block1') | |||||
| x = resnet_block(x, base_size, num_blocks=2, | |||||
| is_first=False, name='resnet_block2') | |||||
| x = resnet_block(x, 2 * base_size, num_blocks=2, | |||||
| is_first=False, name='resnet_block3') | |||||
| x = resnet_block(x, 4 * base_size, num_blocks=2, | |||||
| is_first=False, name='resnet_block4') | |||||
| elif num_layers == 34: | |||||
| print("Building ResNet-34 model...") | |||||
| x = resnet_block(x, base_size, num_blocks=3, | |||||
| is_first=True, name='resnet_block1') | |||||
| x = resnet_block(x, base_size, num_blocks=4, | |||||
| is_first=False, name='resnet_block2') | |||||
| x = resnet_block(x, 2 * base_size, num_blocks=6, | |||||
| is_first=False, name='resnet_block3') | |||||
| x = resnet_block(x, 4 * base_size, num_blocks=3, | |||||
| is_first=False, name='resnet_block4') | |||||
| else: | |||||
| assert False, "Number of layers should be 18 or 34 !" | |||||
| x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn') | |||||
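| # The final feature map has 8*base_size channels at 4x4 spatial resolution, i.e. 128*base_size features per sample. | |||||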
| x = ht.array_reshape_op(x, (-1, 128 * base_size)) | |||||
| y = fc(x, (128 * base_size, num_class), name='resnet_final_fc') | |||||
| # here we don't use cudnn for softmax crossentropy to avoid overflows | |||||
| loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| def resnet18(x, y_, num_class=10): | |||||
| return resnet(x, y_, 18, num_class) | |||||
| def resnet34(x, y_, num_class=10): | |||||
| return resnet(x, y_, 34, num_class) | |||||
| @@ -0,0 +1,100 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def conv_bn_relu(x, in_channel, out_channel, name): | |||||
| weight = init.random_normal(shape=(out_channel, in_channel, 3, 3), | |||||
| stddev=0.1, name=name+'_weight') | |||||
| bn_scale = init.random_normal(shape=(1, out_channel, 1, 1), | |||||
| stddev=0.1, name=name+'_bn_scale') | |||||
| bn_bias = init.random_normal(shape=(1, out_channel, 1, 1), | |||||
| stddev=0.1, name=name+'_bn_bias') | |||||
| x = ht.conv2d_op(x, weight, padding=1, stride=1) | |||||
| x = ht.batch_normalization_op(x, bn_scale, bn_bias) | |||||
| act = ht.relu_op(x) | |||||
| return act | |||||
| def vgg_2block(x, in_channel, out_channel, name): | |||||
| x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') | |||||
| x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| return x | |||||
| def vgg_3block(x, in_channel, out_channel, name): | |||||
| x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') | |||||
| x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| return x | |||||
| def vgg_4block(x, in_channel, out_channel, name): | |||||
| x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') | |||||
| x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4') | |||||
| x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) | |||||
| return x | |||||
| def vgg_fc(x, in_feat, out_feat, name): | |||||
| weight = init.random_normal(shape=(in_feat, out_feat), | |||||
| stddev=0.1, name=name+'_weight') | |||||
| bias = init.random_normal(shape=(out_feat,), | |||||
| stddev=0.1, name=name+'_bias') | |||||
| x = ht.matmul_op(x, weight) | |||||
| x = x + ht.broadcastto_op(bias, x) | |||||
| return x | |||||
| def vgg(x, y_, num_layers, num_class=10): | |||||
| ''' | |||||
| VGG model, for CIFAR10/CIFAR100 dataset. | |||||
| Parameters: | |||||
| x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) | |||||
| y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| num_layers: 16 or 19 | |||||
| Return: | |||||
| loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) | |||||
| y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) | |||||
| ''' | |||||
| if num_layers == 16: | |||||
| print('Building VGG-16 model...') | |||||
| x = vgg_2block(x, 3, 64, 'vgg_block1') | |||||
| x = vgg_2block(x, 64, 128, 'vgg_block2') | |||||
| x = vgg_3block(x, 128, 256, 'vgg_block3') | |||||
| x = vgg_3block(x, 256, 512, 'vgg_block4') | |||||
| x = vgg_3block(x, 512, 512, 'vgg_block5') | |||||
| elif num_layers == 19: | |||||
| print('Building VGG-19 model...') | |||||
| x = vgg_2block(x, 3, 64, 'vgg_block1') | |||||
| x = vgg_2block(x, 64, 128, 'vgg_block2') | |||||
| x = vgg_4block(x, 128, 256, 'vgg_block3') | |||||
| x = vgg_4block(x, 256, 512, 'vgg_block4') | |||||
| x = vgg_4block(x, 512, 512, 'vgg_block5') | |||||
| else: | |||||
| assert False, 'VGG model should have 16 or 19 layers!' | |||||
| x = ht.array_reshape_op(x, (-1, 512)) | |||||
| x = vgg_fc(x, 512, 4096, 'vgg_fc1') | |||||
| x = vgg_fc(x, 4096, 4096, 'vgg_fc2') | |||||
| y = vgg_fc(x, 4096, num_class, 'vgg_fc3') | |||||
| loss = ht.softmaxcrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| return loss, y | |||||
| def vgg16(x, y_, num_class=10): | |||||
| return vgg(x, y_, 16, num_class) | |||||
| def vgg19(x, y_, num_class=10): | |||||
| return vgg(x, y_, 19, num_class) | |||||
| @@ -0,0 +1,9 @@ | |||||
| from .VGG import vgg, vgg16, vgg19 | |||||
| from .LogReg import logreg | |||||
| from .CNN import cnn_3_layers | |||||
| from .AlexNet import alexnet | |||||
| from .LeNet import lenet | |||||
| from .MLP import mlp | |||||
| from .RNN import rnn | |||||
| from .LSTM import lstm | |||||
| from .ResNet import resnet, resnet18, resnet34 | |||||
| @@ -0,0 +1,4 @@ | |||||
| from .mlp import mlp | |||||
| from .resnet import resnet18, resnet34, resnet50 | |||||
| from .vgg import vgg16, vgg19 | |||||
| from .rnn import rnn | |||||
| @@ -0,0 +1,20 @@ | |||||
| import torch.nn.functional as F | |||||
| import torch.nn as nn | |||||
| class MLP(nn.Module): | |||||
| def __init__(self): | |||||
| super(MLP, self).__init__() | |||||
| self.fc1 = nn.Linear(3072, 256) | |||||
| self.fc2 = nn.Linear(256, 256) | |||||
| self.fc3 = nn.Linear(256, 10) | |||||
| def forward(self, x): | |||||
| x = F.relu(self.fc1(x)) | |||||
| x = F.relu(self.fc2(x)) | |||||
| out = self.fc3(x) | |||||
| return out | |||||
| def mlp(): | |||||
| return MLP() | |||||
| @@ -0,0 +1,116 @@ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| class BasicBlock(nn.Module): | |||||
| expansion = 1 | |||||
| def __init__(self, in_planes, planes, stride=1): | |||||
| super(BasicBlock, self).__init__() | |||||
| self.conv1 = nn.Conv2d( | |||||
| in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) | |||||
| self.bn1 = nn.BatchNorm2d(planes) | |||||
| self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, | |||||
| stride=1, padding=1, bias=False) | |||||
| self.bn2 = nn.BatchNorm2d(planes) | |||||
| self.shortcut = nn.Sequential() | |||||
| if stride != 1 or in_planes != self.expansion*planes: | |||||
| self.shortcut = nn.Sequential( | |||||
| nn.Conv2d(in_planes, self.expansion*planes, | |||||
| kernel_size=1, stride=stride, bias=False), | |||||
| nn.BatchNorm2d(self.expansion*planes) | |||||
| ) | |||||
| def forward(self, x): | |||||
| out = F.relu(self.bn1(self.conv1(x))) | |||||
| out = self.bn2(self.conv2(out)) | |||||
| out += self.shortcut(x) | |||||
| out = F.relu(out) | |||||
| return out | |||||
| class Bottleneck(nn.Module): | |||||
| expansion = 4 | |||||
| def __init__(self, in_planes, planes, stride=1): | |||||
| super(Bottleneck, self).__init__() | |||||
| self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) | |||||
| self.bn1 = nn.BatchNorm2d(planes) | |||||
| self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, | |||||
| stride=stride, padding=1, bias=False) | |||||
| self.bn2 = nn.BatchNorm2d(planes) | |||||
| self.conv3 = nn.Conv2d(planes, self.expansion * | |||||
| planes, kernel_size=1, bias=False) | |||||
| self.bn3 = nn.BatchNorm2d(self.expansion*planes) | |||||
| self.shortcut = nn.Sequential() | |||||
| if stride != 1 or in_planes != self.expansion*planes: | |||||
| self.shortcut = nn.Sequential( | |||||
| nn.Conv2d(in_planes, self.expansion*planes, | |||||
| kernel_size=1, stride=stride, bias=False), | |||||
| nn.BatchNorm2d(self.expansion*planes) | |||||
| ) | |||||
| def forward(self, x): | |||||
| out = F.relu(self.bn1(self.conv1(x))) | |||||
| out = F.relu(self.bn2(self.conv2(out))) | |||||
| out = self.bn3(self.conv3(out)) | |||||
| out += self.shortcut(x) | |||||
| out = F.relu(out) | |||||
| return out | |||||
| class ResNet(nn.Module): | |||||
| def __init__(self, block, num_blocks, num_classes=10): | |||||
| super(ResNet, self).__init__() | |||||
| self.in_planes = 64 | |||||
| self.conv1 = nn.Conv2d(3, 64, kernel_size=3, | |||||
| stride=1, padding=1, bias=False) | |||||
| self.bn1 = nn.BatchNorm2d(64) | |||||
| self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) | |||||
| self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) | |||||
| self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) | |||||
| self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) | |||||
| self.linear = nn.Linear(512*block.expansion, num_classes) | |||||
| def _make_layer(self, block, planes, num_blocks, stride): | |||||
| strides = [stride] + [1]*(num_blocks-1) | |||||
| layers = [] | |||||
| for stride in strides: | |||||
| layers.append(block(self.in_planes, planes, stride)) | |||||
| self.in_planes = planes * block.expansion | |||||
| return nn.Sequential(*layers) | |||||
| def forward(self, x): | |||||
| out = F.relu(self.bn1(self.conv1(x))) | |||||
| out = self.layer1(out) | |||||
| out = self.layer2(out) | |||||
| out = self.layer3(out) | |||||
| out = self.layer4(out) | |||||
| out = F.avg_pool2d(out, 4) | |||||
| out = out.view(out.size(0), -1) | |||||
| out = self.linear(out) | |||||
| return out | |||||
| def resnet18(num_classes=10): | |||||
| return ResNet(BasicBlock, [2, 2, 2, 2], num_classes) | |||||
| def resnet34(num_classes=10): | |||||
| return ResNet(BasicBlock, [3, 4, 6, 3], num_classes) | |||||
| def resnet50(num_classes=10): | |||||
| return ResNet(Bottleneck, [3, 4, 6, 3], num_classes) | |||||
| def resnet101(num_classes=10): | |||||
| return ResNet(Bottleneck, [3, 4, 23, 3], num_classes) | |||||
| def resnet152(num_classes=10): | |||||
| return ResNet(Bottleneck, [3, 8, 36, 3], num_classes) | |||||
| @@ -0,0 +1,36 @@ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| class RNN(nn.Module): | |||||
| def __init__(self, diminput, dimoutput, dimhidden, nsteps): | |||||
| super(RNN, self).__init__() | |||||
| self.diminput = diminput | |||||
| self.dimoutput = dimoutput | |||||
| self.dimhidden = dimhidden | |||||
| self.nsteps = nsteps | |||||
| self.fc1 = nn.Linear(diminput, dimhidden) | |||||
| self.fc2 = nn.Linear(dimhidden*2, dimhidden) | |||||
| self.fc3 = nn.Linear(dimhidden, dimoutput) | |||||
| def forward(self, x): | |||||
| last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device) | |||||
| for i in range(self.nsteps): | |||||
| t = i % self.nsteps | |||||
| index = torch.Tensor([idx for idx in range( | |||||
| t*self.diminput, (t+1)*self.diminput)]).long().to(x.device) | |||||
| cur_x = torch.index_select(x, 1, index) | |||||
| h = self.fc1(cur_x) | |||||
| s = torch.cat([h, last_state], axis=1) | |||||
| s = self.fc2(s) | |||||
| last_state = F.relu(s) | |||||
| final_state = last_state | |||||
| y = self.fc3(final_state) | |||||
| return y | |||||
| def rnn(diminput, dimoutput, dimhidden, nsteps): | |||||
| return RNN(diminput, dimoutput, dimhidden, nsteps) | |||||
| @@ -0,0 +1,48 @@ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| cfg = { | |||||
| 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], | |||||
| 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], | |||||
| 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], | |||||
| 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], | |||||
| } | |||||
| class VGG(nn.Module): | |||||
| def __init__(self, vgg_name, num_class=10): | |||||
| super(VGG, self).__init__() | |||||
| self.features = self._make_layers(cfg[vgg_name]) | |||||
| self.fc1 = nn.Linear(512, 4096) | |||||
| self.fc2 = nn.Linear(4096, 4096) | |||||
| self.classifier = nn.Linear(4096, num_class) | |||||
| def forward(self, x): | |||||
| out = self.features(x) | |||||
| out = out.view(out.size(0), -1) | |||||
| out = self.fc2(self.fc1(out)) | |||||
| out = self.classifier(out) | |||||
| return out | |||||
| def _make_layers(self, cfg): | |||||
| layers = [] | |||||
| in_channels = 3 | |||||
| for x in cfg: | |||||
| if x == 'M': | |||||
| layers += [nn.MaxPool2d(kernel_size=2, stride=2)] | |||||
| else: | |||||
| layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), | |||||
| nn.BatchNorm2d(x), | |||||
| nn.ReLU(inplace=True)] | |||||
| in_channels = x | |||||
| layers += [nn.AvgPool2d(kernel_size=1, stride=1)] | |||||
| return nn.Sequential(*layers) | |||||
| def vgg16(num_class=10): | |||||
| return VGG('VGG16', num_class) | |||||
| def vgg19(num_class=10): | |||||
| return VGG('VGG19', num_class) | |||||
| @@ -0,0 +1,309 @@ | |||||
| import os | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| import tf_models | |||||
| import time | |||||
| import argparse | |||||
| from tqdm import tqdm | |||||
| from sklearn import metrics | |||||
| import horovod.tensorflow as hvd | |||||
| import hetu as ht | |||||
| import logging | |||||
| logging.basicConfig(level=logging.INFO, | |||||
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |||||
| logger = logging.getLogger(__name__) | |||||
| def print_rank0(msg): | |||||
| if rank % 8 == 0: | |||||
| logger.info(msg) | |||||
| def pop_env(): | |||||
| for k in ['https_proxy', 'http_proxy']: | |||||
| if k in os.environ: | |||||
| os.environ.pop(k) | |||||
| pop_env() | |||||
| # horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model | |||||
| # horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model | |||||
| # horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8 | |||||
| # python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn | |||||
| # if using a multi-node setup under conda, /etc/bash.bashrc may need to be modified | |||||
| # we can also use mpirun (default gloo): | |||||
| # ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ | |||||
| # -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model | |||||
| ''' | |||||
| def train(model, args): | |||||
| hvd.init() | |||||
| def get_current_shard(data): | |||||
| part_size = data.shape[0] // hvd.size() | |||||
| start = part_size * hvd.rank() | |||||
| end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0] | |||||
| return data[start:end] | |||||
| batch_size = 128 | |||||
| if args.model == 'tf_resnet34': | |||||
| train_images, train_labels, test_images,\ | |||||
| test_labels = ht.data.tf_normalize_cifar10() | |||||
| x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) | |||||
| else: | |||||
| datasets = ht.data.mnist() | |||||
| train_images, train_labels = datasets[0] | |||||
| test_images, test_labels = datasets[2] | |||||
| x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) | |||||
| n_train_batches = train_images.shape[0] // batch_size | |||||
| loss, y = model(x, y_) | |||||
| opt = tf.train.GradientDescentOptimizer(learning_rate=0.01) | |||||
| global_step = tf.train.get_or_create_global_step() | |||||
| # here in DistributedOptimizer by default all tensor are reduced on GPU | |||||
| # can use device_sparse=xxx, device_dense=xxx to modify | |||||
| # if using device_sparse='/cpu:0', the performance degrades | |||||
| train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step) | |||||
| gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank())) | |||||
| # here horovod default use gpu to initialize, which will cause OOM | |||||
| hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] | |||||
| sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) | |||||
| iterations = train_images.shape[0] // batch_size | |||||
| total_epoch = 10 | |||||
| start_index = 0 | |||||
| total_time = 0 | |||||
| for ep in range(total_epoch + 1): | |||||
| print("epoch %d" % ep) | |||||
| st_time = time.time() | |||||
| train_loss, train_acc = [], [] | |||||
| for it in range(n_train_batches): | |||||
| x_val = train_images[start_index: start_index + batch_size] | |||||
| y_val = train_labels[start_index : start_index+batch_size] | |||||
| start_index += batch_size | |||||
| if start_index + batch_size > train_images.shape[0]: | |||||
| start_index = 0 | |||||
| loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val}) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| en_time = time.time() | |||||
| train_time = en_time - st_time | |||||
| if ep != 0: | |||||
| total_time += train_time | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\ | |||||
| % (tra_loss, tra_accuracy, train_time) | |||||
| print("training time:", total_time) | |||||
| def main(): | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument("--model", type=str, required=True, help="model to be tested") | |||||
| parser.add_argument("--all", action="store_true", help="whether to use all data") | |||||
| args = parser.parse_args() | |||||
| raw_model = args.model | |||||
| import tf_models | |||||
| model = eval('tf_models.' + raw_model) | |||||
| print('Model:', raw_model) | |||||
| train(model, args) | |||||
| if __name__ == '__main__': | |||||
| main() | |||||
| ''' | |||||
| if __name__ == "__main__": | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument('--model', type=str, required=True, | |||||
| help='model to be tested') | |||||
| parser.add_argument('--dataset', type=str, required=True, | |||||
| help='dataset to be trained on') | |||||
| parser.add_argument('--batch-size', type=int, | |||||
| default=128, help='batch size') | |||||
| parser.add_argument('--learning-rate', type=float, | |||||
| default=0.1, help='learning rate') | |||||
| parser.add_argument('--opt', type=str, default='sgd', | |||||
| help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') | |||||
| parser.add_argument('--num-epochs', type=int, | |||||
| default=20, help='epoch number') | |||||
| parser.add_argument('--validate', action='store_true', | |||||
| help='whether to use validation') | |||||
| parser.add_argument('--timing', action='store_true', | |||||
| help='whether to time the training phase') | |||||
| args = parser.parse_args() | |||||
| hvd.init() | |||||
| global rank | |||||
| rank = hvd.rank() | |||||
| assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ | |||||
| 'Model not supported now.' | |||||
| model = eval('tf_models.' + args.model) | |||||
| assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] | |||||
| dataset = args.dataset | |||||
| assert args.opt in ['sgd', 'momentum', 'nesterov', | |||||
| 'adagrad', 'adam'], 'Optimizer not supported!' | |||||
| if args.opt == 'sgd': | |||||
| print_rank0('Use SGD Optimizer.') | |||||
| opt = tf.train.GradientDescentOptimizer( | |||||
| learning_rate=args.learning_rate) | |||||
| elif args.opt == 'momentum': | |||||
| print_rank0('Use Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9) | |||||
| elif args.opt == 'nesterov': | |||||
| print_rank0('Use Nesterov Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) | |||||
| elif args.opt == 'adagrad': | |||||
| print_rank0('Use AdaGrad Optimizer.') | |||||
| opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) | |||||
| else: | |||||
| print_rank0('Use Adam Optimizer.') | |||||
| opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) | |||||
| if dataset == 'MNIST': | |||||
| datasets = ht.data.mnist() | |||||
| train_set_x, train_set_y = datasets[0] | |||||
| valid_set_x, valid_set_y = datasets[1] | |||||
| test_set_x, test_set_y = datasets[2] | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 784), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 784), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR10': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=10) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| if args.model == "tf_mlp": | |||||
| train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) | |||||
| valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR100': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=100) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| if dataset == 'MNIST': | |||||
| x = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 784), name='x') | |||||
| y_ = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_) | |||||
| elif dataset == 'CIFAR10': | |||||
| if args.model == "tf_mlp": | |||||
| x = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 3072), name='x') | |||||
| y_ = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| else: | |||||
| x = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 32, 32, 3), name='x') | |||||
| y_ = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_, 10) | |||||
| elif dataset == 'CIFAR100': | |||||
| x = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 32, 32, 3), name='x') | |||||
| y_ = tf.compat.v1.placeholder( | |||||
| dtype=tf.float32, shape=(None, 100), name='y_') | |||||
| loss, y = model(x, y_, 100) | |||||
| global_step = tf.train.get_or_create_global_step() | |||||
| # By default, DistributedOptimizer reduces all tensors on the GPU; | |||||
| # this can be changed with the device_sparse=... / device_dense=... arguments | |||||
| # (using device_sparse='/cpu:0' degrades performance) | |||||
| train_op = hvd.DistributedOptimizer( | |||||
| opt).minimize(loss, global_step=global_step) | |||||
| gpu_options = tf.compat.v1.GPUOptions( | |||||
| allow_growth=True, visible_device_list=str(hvd.local_rank())) | |||||
| # Horovod broadcasts the initial variables on the GPU by default, which can cause OOM, so broadcast on the CPU instead | |||||
| hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] | |||||
| sess = tf.compat.v1.train.MonitoredTrainingSession( | |||||
| hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) | |||||
| # sess.run(tf.compat.v1.global_variables_initializer()) | |||||
| # training | |||||
| print_rank0("Start training loop...") | |||||
| running_time = 0 | |||||
| for i in range(args.num_epochs + 1): | |||||
| print_rank0("Epoch %d" % i) | |||||
| loss_all = 0 | |||||
| batch_num = 0 | |||||
| if args.timing: | |||||
| start = time.time() | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_train_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| x_val = train_set_x[minibatch_start:minibatch_end] | |||||
| y_val = train_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, predict_y, _ = sess.run([loss, y, train_op], | |||||
| feed_dict={x: x_val, y_: y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(predict_y, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| batch_num += 1 | |||||
| loss_all += loss_val | |||||
| loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Train loss = %f" % loss_all) | |||||
| print_rank0("Train accuracy = %f" % accuracy) | |||||
| if args.timing: | |||||
| end = time.time() | |||||
| print_rank0("Running time of current epoch = %fs" % (end - start)) | |||||
| if i != 0: | |||||
| running_time += (end - start) | |||||
| if args.validate: | |||||
| val_loss_all = 0 | |||||
| batch_num = 0 | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_valid_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| valid_x_val = valid_set_x[minibatch_start:minibatch_end] | |||||
| valid_y_val = valid_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, valid_y_predicted = sess.run([loss, y], | |||||
| feed_dict={x: valid_x_val, y_: valid_y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(valid_y_val, 1), | |||||
| np.argmax(valid_y_predicted, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| val_loss_all += loss_val | |||||
| batch_num += 1 | |||||
| val_loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Validation loss = %f" % val_loss_all) | |||||
| print_rank0("Validation accuracy = %f" % accuracy) | |||||
| print_rank0("*"*50) | |||||
| print_rank0("Running time of total %d epoch = %fs" % | |||||
| (args.num_epochs, running_time)) | |||||
| @@ -0,0 +1,9 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../main.py | |||||
| depsdir=${workdir}/../../.. | |||||
| echo $depsdir | |||||
| ### validate and timing | |||||
| $depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce | |||||
| @@ -0,0 +1,11 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../main.py | |||||
| # model: | |||||
| # e.g. bash hetu_1gpu.sh mlp CIFAR10 | |||||
| ### validate and timing | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing | |||||
| @@ -0,0 +1,10 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../main.py | |||||
| ### validate and timing | |||||
| python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched & | |||||
| python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 & | |||||
| python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 & | |||||
| wait | |||||
| @@ -0,0 +1,8 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../main.py | |||||
| depsdir=${workdir}/../../.. | |||||
| ### validate and timing | |||||
| # | |||||
| NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce | |||||
| @@ -0,0 +1,11 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_tf_horovod.py | |||||
| # horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing | |||||
| horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing | |||||
| # ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ | |||||
| # -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_tf_horovod.py | |||||
| horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing | |||||
| @@ -0,0 +1,18 @@ | |||||
| #!/bin/bash | |||||
| GPUS_PER_NODE=8 | |||||
| # Change for multinode config | |||||
| MASTER_ADDR=162.105.146.117 | |||||
| MASTER_PORT=6000 | |||||
| NNODES=2 | |||||
| NODE_RANK=0 | |||||
| WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../torch_main.py | |||||
| DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" | |||||
| python -m torch.distributed.launch $DISTRIBUTED_ARGS \ | |||||
| ${mainpy} \ | |||||
| --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed | |||||
| @@ -0,0 +1,18 @@ | |||||
| #!/bin/bash | |||||
| GPUS_PER_NODE=8 | |||||
| # Change for multinode config | |||||
| MASTER_ADDR=162.105.146.117 | |||||
| MASTER_PORT=39575 | |||||
| NNODES=2 | |||||
| NODE_RANK=1 | |||||
| WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../torch_main.py | |||||
| DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" | |||||
| python -m torch.distributed.launch $DISTRIBUTED_ARGS \ | |||||
| ${mainpy} \ | |||||
| --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed | |||||
| @@ -0,0 +1,7 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../torch_main.py | |||||
| ## validate and timing | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing | |||||
| @@ -0,0 +1,18 @@ | |||||
| #!/bin/bash | |||||
| GPUS_PER_NODE=8 | |||||
| # Change for multinode config | |||||
| MASTER_ADDR=localhost | |||||
| MASTER_PORT=6000 | |||||
| NNODES=1 | |||||
| NODE_RANK=0 | |||||
| WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../torch_main.py | |||||
| DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" | |||||
| python -m torch.distributed.launch $DISTRIBUTED_ARGS \ | |||||
| ${mainpy} \ | |||||
| --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed | |||||
| @@ -0,0 +1,15 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../tf_launch_worker.py | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate & | |||||
| wait | |||||
| @@ -0,0 +1,14 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../tf_launch_worker.py | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate & | |||||
| wait | |||||
| @@ -0,0 +1,10 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../tf_main.py | |||||
| ### validate and timing | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing | |||||
| ### run in cpu | |||||
| # python ${mainpy} --model tf_mlp --gpu -1 --validate --timing | |||||
| @@ -0,0 +1,15 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../tf_launch_worker.py | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate & | |||||
| python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate & | |||||
| wait | |||||
| @@ -0,0 +1,23 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "162.105.146.117:34569", | |||||
| "162.105.146.117:34568", | |||||
| "162.105.146.117:34567", | |||||
| "162.105.146.117:34566", | |||||
| "162.105.146.117:34565", | |||||
| "162.105.146.117:34564", | |||||
| "162.105.146.117:34563", | |||||
| "162.105.146.117:34562", | |||||
| "162.105.146.118:34779", | |||||
| "162.105.146.118:34778", | |||||
| "162.105.146.118:34777", | |||||
| "162.105.146.118:34776", | |||||
| "162.105.146.118:34775", | |||||
| "162.105.146.118:34774", | |||||
| "162.105.146.118:34773", | |||||
| "162.105.146.118:34772" | |||||
| ], | |||||
| "ps": [ | |||||
| "162.105.146.117:34575" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,11 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "162.105.146.119:34569", | |||||
| "162.105.146.119:34568", | |||||
| "162.105.146.119:34567", | |||||
| "162.105.146.119:34566" | |||||
| ], | |||||
| "ps": [ | |||||
| "162.105.146.119:34575" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,15 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "162.105.146.119:34569", | |||||
| "162.105.146.119:34568", | |||||
| "162.105.146.119:34567", | |||||
| "162.105.146.119:34566", | |||||
| "162.105.146.119:34565", | |||||
| "162.105.146.119:34564", | |||||
| "162.105.146.119:34563", | |||||
| "162.105.146.119:34562" | |||||
| ], | |||||
| "ps": [ | |||||
| "162.105.146.119:34575" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,49 @@ | |||||
| import os | |||||
| import tensorflow as tf | |||||
| import multiprocessing | |||||
| import signal | |||||
| import json | |||||
| import argparse | |||||
| def pop_env(): | |||||
| for k in ['https_proxy', 'http_proxy']: | |||||
| if k in os.environ: | |||||
| os.environ.pop(k) | |||||
| os.environ['CUDA_VISIBLE_DEVICES'] = '' | |||||
| pop_env() | |||||
| def start_server(cluster, task_id): | |||||
| server = tf.train.Server(cluster, job_name='ps', task_index=task_id) | |||||
| server.join() | |||||
| def main(): | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument( | |||||
| "--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path") | |||||
| parser.add_argument("--id", type=int, required=True) | |||||
| args = parser.parse_args() | |||||
| raw_config = args.config | |||||
| config = json.load(open(raw_config)) | |||||
| cluster = tf.train.ClusterSpec(config) | |||||
| global proc | |||||
| proc = multiprocessing.Process( | |||||
| target=start_server, args=[cluster, args.id, ]) | |||||
| proc.start() | |||||
| signal.signal(signal.SIGINT, signal_handler) | |||||
| proc.join() | |||||
| def signal_handler(signal, frame): | |||||
| print("SIGINT signal caught, stop Training") | |||||
| global proc | |||||
| proc.kill() | |||||
| exit(0) | |||||
| if __name__ == '__main__': | |||||
| main() | |||||
| @@ -0,0 +1,234 @@ | |||||
| import tensorflow as tf | |||||
| import tf_models | |||||
| import hetu as ht | |||||
| import numpy as np | |||||
| import argparse | |||||
| import json | |||||
| from time import time | |||||
| import os | |||||
| import logging | |||||
| logging.basicConfig(level=logging.INFO, | |||||
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |||||
| logger = logging.getLogger(__name__) | |||||
| def print_rank0(msg): | |||||
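| # log only from ranks that are multiples of 8, i.e. the first worker on each 8-GPU node | |||||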
| if task_id % 8 == 0: | |||||
| logger.info(msg) | |||||
| def pop_env(): | |||||
| for k in ['https_proxy', 'http_proxy']: | |||||
| if k in os.environ: | |||||
| os.environ.pop(k) | |||||
| pop_env() | |||||
| if __name__ == "__main__": | |||||
| # argument parser | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument('--model', type=str, required=True, | |||||
| help='model to be tested') | |||||
| parser.add_argument('--dataset', type=str, required=True, | |||||
| help='dataset to be trained on') | |||||
| parser.add_argument('--batch-size', type=int, | |||||
| default=128, help='batch size') | |||||
| parser.add_argument('--learning-rate', type=float, | |||||
| default=0.1, help='learning rate') | |||||
| parser.add_argument('--opt', type=str, default='sgd', | |||||
| help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') | |||||
| parser.add_argument('--num-epochs', type=int, | |||||
| default=20, help='epoch number') | |||||
| parser.add_argument('--gpu', type=int, default=0, | |||||
| help='gpu to be used, -1 means cpu') | |||||
| parser.add_argument('--validate', action='store_true', | |||||
| help='whether to use validation') | |||||
| parser.add_argument('--timing', action='store_true', | |||||
| help='whether to time the training phase') | |||||
| parser.add_argument("--rank", type=int, required=True, | |||||
| help="rank of process") | |||||
| parser.add_argument( | |||||
| "--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path") | |||||
| args = parser.parse_args() | |||||
| global task_id | |||||
| task_id = int(args.rank) | |||||
| print_rank0("task id %d" % (task_id)) | |||||
| raw_config = args.config | |||||
| if args.gpu == -1: | |||||
| device = '/job:worker/task:%d/cpu:0' % (task_id) | |||||
| print_rank0('Use CPU.') | |||||
| else: | |||||
| device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu) | |||||
| print_rank0('Use GPU %d.' % args.gpu) | |||||
| config = json.load(open(raw_config)) | |||||
| cluster = tf.train.ClusterSpec(config) | |||||
| assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ | |||||
| 'Model not supported now.' | |||||
| model = eval('tf_models.' + args.model) | |||||
| assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] | |||||
| dataset = args.dataset | |||||
| assert args.opt in ['sgd', 'momentum', 'nesterov', | |||||
| 'adagrad', 'adam'], 'Optimizer not supported!' | |||||
| if args.opt == 'sgd': | |||||
| print_rank0('Use SGD Optimizer.') | |||||
| opt = tf.train.GradientDescentOptimizer( | |||||
| learning_rate=args.learning_rate) | |||||
| elif args.opt == 'momentum': | |||||
| print_rank0('Use Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9) | |||||
| elif args.opt == 'nesterov': | |||||
| print_rank0('Use Nesterov Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) | |||||
| elif args.opt == 'adagrad': | |||||
| print_rank0('Use AdaGrad Optimizer.') | |||||
| opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) | |||||
| else: | |||||
| print_rank0('Use Adam Optimizer.') | |||||
| opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) | |||||
| with tf.device( | |||||
| tf.compat.v1.train.replica_device_setter( | |||||
| worker_device=device, | |||||
| cluster=cluster)): | |||||
| # data loading | |||||
| print_rank0('Loading %s data...' % dataset) | |||||
| if dataset == 'MNIST': | |||||
| datasets = ht.data.mnist() | |||||
| train_set_x, train_set_y = datasets[0] | |||||
| valid_set_x, valid_set_y = datasets[1] | |||||
| test_set_x, test_set_y = datasets[2] | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 784), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 784), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR10': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=10) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| if args.model == "tf_mlp": | |||||
| train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) | |||||
| valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR100': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=100) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| if dataset == 'MNIST': | |||||
| x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') | |||||
| y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_) | |||||
| elif dataset == 'CIFAR10': | |||||
| if args.model == "tf_mlp": | |||||
| x = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 3072), name='x') | |||||
| y_ = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| else: | |||||
| x = tf.placeholder(dtype=tf.float32, shape=( | |||||
| None, 32, 32, 3), name='x') | |||||
| y_ = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_, 10) | |||||
| elif dataset == 'CIFAR100': | |||||
| x = tf.placeholder(dtype=tf.float32, shape=( | |||||
| None, 32, 32, 3), name='x') | |||||
| y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') | |||||
| loss, y = model(x, y_, 100) | |||||
| train_op = opt.minimize(loss) | |||||
| server = tf.train.Server( | |||||
| cluster, job_name="worker", task_index=task_id) | |||||
| init = tf.compat.v1.global_variables_initializer() | |||||
| sv = tf.train.Supervisor( | |||||
| is_chief=(task_id == 0), | |||||
| init_op=init, | |||||
| recovery_wait_secs=1) | |||||
| sess_config = tf.compat.v1.ConfigProto( | |||||
| allow_soft_placement=True, | |||||
| log_device_placement=False, | |||||
| device_filters=["/job:ps", | |||||
| "/job:worker/task:%d" % task_id]) | |||||
| sess = sv.prepare_or_wait_for_session( | |||||
| server.target, config=sess_config) | |||||
| sess.run(init) | |||||
| # training | |||||
| print_rank0("Start training loop...") | |||||
| running_time = 0 | |||||
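| # num_epochs + 1 epochs are run; epoch 0 serves as a warm-up and is excluded from the total running time | |||||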
| for i in range(args.num_epochs + 1): | |||||
| print_rank0("Epoch %d" % i) | |||||
| loss_all = 0 | |||||
| batch_num = 0 | |||||
| if args.timing: | |||||
| start = time() | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_train_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| x_val = train_set_x[minibatch_start:minibatch_end] | |||||
| y_val = train_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, predict_y, _ = sess.run([loss, y, train_op], | |||||
| feed_dict={x: x_val, y_: y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(predict_y, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| batch_num += 1 | |||||
| loss_all += loss_val | |||||
| loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Train loss = %f" % loss_all) | |||||
| print_rank0("Train accuracy = %f" % accuracy) | |||||
| if args.timing: | |||||
| end = time() | |||||
| print_rank0("Running time of current epoch = %fs" % | |||||
| (end - start)) | |||||
| if i != 0: | |||||
| running_time += (end - start) | |||||
| if args.validate: | |||||
| val_loss_all = 0 | |||||
| batch_num = 0 | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_valid_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| valid_x_val = valid_set_x[minibatch_start:minibatch_end] | |||||
| valid_y_val = valid_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, valid_y_predicted = sess.run([loss, y], | |||||
| feed_dict={x: valid_x_val, y_: valid_y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(valid_y_val, 1), | |||||
| np.argmax(valid_y_predicted, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| val_loss_all += loss_val  # accumulate the validation minibatch loss | |||||
| batch_num += 1 | |||||
| val_loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Validation loss = %f" % val_loss_all) | |||||
| print_rank0("Validation accuracy = %f" % accuracy) | |||||
| print_rank0("*"*50) | |||||
| print_rank0("Running time of total %d epoch = %fs" % | |||||
| (args.num_epochs, running_time)) | |||||
| @@ -0,0 +1,194 @@ | |||||
| import tensorflow as tf | |||||
| import tf_models | |||||
| import hetu as ht | |||||
| import numpy as np | |||||
| import argparse | |||||
| from time import time | |||||
| import logging | |||||
| logging.basicConfig(level=logging.INFO, | |||||
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |||||
| logger = logging.getLogger(__name__) | |||||
| def print_rank0(msg): | |||||
| logger.info(msg) | |||||
| if __name__ == "__main__": | |||||
| # argument parser | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument('--model', type=str, required=True, | |||||
| help='model to be tested') | |||||
| parser.add_argument('--dataset', type=str, required=True, | |||||
| help='dataset to be trained on') | |||||
| parser.add_argument('--batch-size', type=int, | |||||
| default=128, help='batch size') | |||||
| parser.add_argument('--learning-rate', type=float, | |||||
| default=0.1, help='learning rate') | |||||
| parser.add_argument('--opt', type=str, default='sgd', | |||||
| help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') | |||||
| parser.add_argument('--num-epochs', type=int, | |||||
| default=20, help='epoch number') | |||||
| parser.add_argument('--gpu', type=int, default=0, | |||||
| help='gpu to be used, -1 means cpu') | |||||
| parser.add_argument('--validate', action='store_true', | |||||
| help='whether to use validation') | |||||
| parser.add_argument('--timing', action='store_true', | |||||
| help='whether to time the training phase') | |||||
| args = parser.parse_args() | |||||
| if args.gpu == -1: | |||||
| device = '/cpu:0' | |||||
| print_rank0('Use CPU.') | |||||
| else: | |||||
| device = '/gpu:%d' % args.gpu | |||||
| print_rank0('Use GPU %d.' % args.gpu) | |||||
| print_rank0("Training {} on TensorFlow".format(args.model)) | |||||
| assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ | |||||
| 'Model not supported now.' | |||||
| model = eval('tf_models.' + args.model) | |||||
| assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] | |||||
| dataset = args.dataset | |||||
| assert args.opt in ['sgd', 'momentum', 'nesterov', | |||||
| 'adagrad', 'adam'], 'Optimizer not supported!' | |||||
| if args.opt == 'sgd': | |||||
| print_rank0('Use SGD Optimizer.') | |||||
| opt = tf.train.GradientDescentOptimizer( | |||||
| learning_rate=args.learning_rate) | |||||
| elif args.opt == 'momentum': | |||||
| print_rank0('Use Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9) | |||||
| elif args.opt == 'nesterov': | |||||
| print_rank0('Use Nesterov Momentum Optimizer.') | |||||
| opt = tf.train.MomentumOptimizer( | |||||
| learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) | |||||
| elif args.opt == 'adagrad': | |||||
| print_rank0('Use AdaGrad Optimizer.') | |||||
| opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) | |||||
| else: | |||||
| print_rank0('Use Adam Optimizer.') | |||||
| opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) | |||||
| # model definition | |||||
| print_rank0('Building model...') | |||||
| with tf.device(device): | |||||
| if dataset == 'MNIST': | |||||
| x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') | |||||
| y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_) | |||||
| elif dataset == 'CIFAR10': | |||||
| if args.model == "tf_mlp": | |||||
| x = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 3072), name='x') | |||||
| y_ = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| else: | |||||
| x = tf.placeholder(dtype=tf.float32, shape=( | |||||
| None, 32, 32, 3), name='x') | |||||
| y_ = tf.placeholder( | |||||
| dtype=tf.float32, shape=(None, 10), name='y_') | |||||
| loss, y = model(x, y_, 10) | |||||
| elif dataset == 'CIFAR100': | |||||
| x = tf.placeholder(dtype=tf.float32, shape=( | |||||
| None, 32, 32, 3), name='x') | |||||
| y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') | |||||
| loss, y = model(x, y_, 100) | |||||
| train_op = opt.minimize(loss) | |||||
| # data loading | |||||
| print_rank0('Loading %s data...' % dataset) | |||||
| if dataset == 'MNIST': | |||||
| datasets = ht.data.mnist() | |||||
| train_set_x, train_set_y = datasets[0] | |||||
| valid_set_x, valid_set_y = datasets[1] | |||||
| test_set_x, test_set_y = datasets[2] | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 784), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 784), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR10': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=10) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| if args.model == "tf_mlp": | |||||
| train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) | |||||
| valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| elif dataset == 'CIFAR100': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( | |||||
| num_class=100) | |||||
| n_train_batches = train_set_x.shape[0] // args.batch_size | |||||
| n_valid_batches = valid_set_x.shape[0] // args.batch_size | |||||
| # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) | |||||
| # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| # training | |||||
| print_rank0("Start training loop...") | |||||
| running_time = 0 | |||||
| with tf.Session() as sess: | |||||
| sess.run(tf.global_variables_initializer()) | |||||
| for i in range(args.num_epochs + 1): | |||||
| print_rank0("Epoch %d" % i) | |||||
| loss_all = 0 | |||||
| batch_num = 0 | |||||
| if args.timing: | |||||
| start = time() | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_train_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| x_val = train_set_x[minibatch_start:minibatch_end] | |||||
| y_val = train_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, predict_y, _ = sess.run([loss, y, train_op], | |||||
| feed_dict={x: x_val, y_: y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(predict_y, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| batch_num += 1 | |||||
| loss_all += loss_val | |||||
| loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Train loss = %f" % loss_all) | |||||
| print_rank0("Train accuracy = %f" % accuracy) | |||||
| if args.timing: | |||||
| end = time() | |||||
| print_rank0("Running time of current epoch = %fs" % | |||||
| (end - start)) | |||||
| if i != 0: | |||||
| running_time += (end - start) | |||||
| if args.validate: | |||||
| val_loss_all = 0 | |||||
| batch_num = 0 | |||||
| correct_predictions = [] | |||||
| for minibatch_index in range(n_valid_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| valid_x_val = valid_set_x[minibatch_start:minibatch_end] | |||||
| valid_y_val = valid_set_y[minibatch_start:minibatch_end] | |||||
| loss_val, valid_y_predicted = sess.run([loss, y], | |||||
| feed_dict={x: valid_x_val, y_: valid_y_val}) | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(valid_y_val, 1), | |||||
| np.argmax(valid_y_predicted, 1)).astype(np.float32) | |||||
| correct_predictions.extend(correct_prediction) | |||||
| val_loss_all += loss_val  # accumulate the validation minibatch loss | |||||
| batch_num += 1 | |||||
| val_loss_all /= batch_num | |||||
| accuracy = np.mean(correct_predictions) | |||||
| print_rank0("Validation loss = %f" % val_loss_all) | |||||
| print_rank0("Validation accuracy = %f" % accuracy) | |||||
| print_rank0("*"*50) | |||||
| print_rank0("Running time of total %d epoch = %fs" % | |||||
| (args.num_epochs, running_time)) | |||||
| @@ -0,0 +1,8 @@ | |||||
| from .tf_LogReg import tf_logreg | |||||
| from .tf_CNN import tf_cnn_3_layers | |||||
| from .tf_LeNet import tf_lenet | |||||
| from .tf_MLP import tf_mlp | |||||
| from .tf_RNN import tf_rnn | |||||
| from .tf_LSTM import tf_lstm | |||||
| from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34 | |||||
| from .tf_VGG import tf_vgg16, tf_vgg19 | |||||
| @@ -0,0 +1,45 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_conv_relu_avg(x, shape): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32)) | |||||
| x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) | |||||
| x = tf.nn.relu(x) | |||||
| x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1], | |||||
| padding='VALID', strides=[1, 2, 2, 1]) | |||||
| return x | |||||
| def tf_fc(x, shape): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape[-1:]).astype(np.float32)) | |||||
| x = tf.reshape(x, (-1, shape[0])) | |||||
| y = tf.matmul(x, weight) + bias | |||||
| return y | |||||
| def tf_cnn_3_layers(x, y_): | |||||
| ''' | |||||
| 3-layer-CNN model in TensorFlow, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print('Building 3-layer-CNN model in tensorflow...') | |||||
| x = tf.reshape(x, [-1, 28, 28, 1]) | |||||
| x = tf_conv_relu_avg(x, (32, 1, 5, 5)) | |||||
| x = tf_conv_relu_avg(x, (64, 32, 5, 5)) | |||||
| x = tf.transpose(x, [0, 3, 1, 2]) | |||||
| y = tf_fc(x, (7 * 7 * 64, 10)) | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,81 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_lstm(x, y_): | |||||
| ''' | |||||
| LSTM model in TensorFlow, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print("Building LSTM model in tensorflow...") | |||||
| diminput = 28 | |||||
| dimhidden = 128 | |||||
| dimoutput = 10 | |||||
| nsteps = 28 | |||||
| forget_gate_w = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) | |||||
| forget_gate_u = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) | |||||
| forget_gate_b = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden,)).astype(np.float32)) | |||||
| input_gate_w = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) | |||||
| input_gate_u = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) | |||||
| input_gate_b = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden,)).astype(np.float32)) | |||||
| output_gate_w = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) | |||||
| output_gate_u = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) | |||||
| output_gate_b = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden,)).astype(np.float32)) | |||||
| tanh_w = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) | |||||
| tanh_u = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) | |||||
| tanh_b = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden,)).astype(np.float32)) | |||||
| out_weights = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) | |||||
| out_bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimoutput,)).astype(np.float32)) | |||||
| initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32) | |||||
| last_c_state = initial_state | |||||
| last_h_state = initial_state | |||||
| for i in range(nsteps): | |||||
| cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) | |||||
| # forget gate | |||||
| cur_forget = tf.matmul(last_h_state, forget_gate_u) + \ | |||||
| tf.matmul(cur_x, forget_gate_w) + forget_gate_b | |||||
| cur_forget = tf.sigmoid(cur_forget) | |||||
| # input gate | |||||
| cur_input = tf.matmul(last_h_state, input_gate_u) + \ | |||||
| tf.matmul(cur_x, input_gate_w) + input_gate_b | |||||
| cur_input = tf.sigmoid(cur_input) | |||||
| # output gate | |||||
| cur_output = tf.matmul(last_h_state, output_gate_u) + \ | |||||
| tf.matmul(cur_x, output_gate_w) + output_gate_b | |||||
| cur_output = tf.sigmoid(cur_output) | |||||
| # tanh | |||||
| cur_tanh = tf.matmul(last_h_state, tanh_u) + \ | |||||
| tf.matmul(cur_x, tanh_w) + tanh_b | |||||
| cur_tanh = tf.tanh(cur_tanh) | |||||
| last_c_state = last_c_state * cur_forget + cur_input * cur_tanh | |||||
| last_h_state = tf.tanh(last_c_state) * cur_output | |||||
| y = tf.matmul(last_h_state, out_weights) + out_bias | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,49 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_conv_pool(x, in_channel, out_channel): | |||||
| weight = tf.Variable(np.random.normal(scale=0.1, size=( | |||||
| out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32)) | |||||
| x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) | |||||
| x = tf.nn.relu(x) | |||||
| x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], | |||||
| padding='VALID', strides=[1, 2, 2, 1]) | |||||
| return x | |||||
| def tf_fc(x, shape, with_relu=True): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape[-1:]).astype(np.float32)) | |||||
| x = tf.matmul(x, weight) + bias | |||||
| if with_relu: | |||||
| x = tf.nn.relu(x) | |||||
| return x | |||||
| def tf_lenet(x, y_): | |||||
| ''' | |||||
| LeNet model in TensorFlow, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print('Building LeNet model in tensorflow...') | |||||
| x = tf.reshape(x, [-1, 28, 28, 1]) | |||||
| x = tf_conv_pool(x, 1, 6) | |||||
| x = tf_conv_pool(x, 6, 16) | |||||
| x = tf.transpose(x, [0, 3, 1, 2]) | |||||
| x = tf.reshape(x, (-1, 7*7*16)) | |||||
| x = tf_fc(x, (7*7*16, 120), with_relu=True) | |||||
| x = tf_fc(x, (120, 84), with_relu=True) | |||||
| y = tf_fc(x, (84, 10), with_relu=False) | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,23 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_logreg(x, y_): | |||||
| ''' | |||||
| Logistic Regression model in TensorFlow, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print("Build logistic regression model in tensorflow...") | |||||
| weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32)) | |||||
| bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32)) | |||||
| y = tf.matmul(x, weight) + bias | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,34 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_fc(x, shape, with_relu=True): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape[-1:]).astype(np.float32)) | |||||
| x = tf.matmul(x, weight) + bias | |||||
| if with_relu: | |||||
| x = tf.nn.relu(x) | |||||
| return x | |||||
| def tf_mlp(x, y_, num_class=10): | |||||
| ''' | |||||
| MLP model in TensorFlow, for CIFAR dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print("Building MLP model in tensorflow...") | |||||
| x = tf_fc(x, (3072, 256), with_relu=True) | |||||
| x = tf_fc(x, (256, 256), with_relu=True) | |||||
| y = tf_fc(x, (256, num_class), with_relu=False) | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,49 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_rnn(x, y_): | |||||
| ''' | |||||
| RNN model in TensorFlow, for MNIST dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print("Building RNN model in tensorflow...") | |||||
| diminput = 28 | |||||
| dimhidden = 128 | |||||
| dimoutput = 10 | |||||
| nsteps = 28 | |||||
| weight1 = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) | |||||
| bias1 = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, )).astype(np.float32)) | |||||
| weight2 = tf.Variable(np.random.normal(scale=0.1, size=( | |||||
| dimhidden + dimhidden, dimhidden)).astype(np.float32)) | |||||
| bias2 = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, )).astype(np.float32)) | |||||
| weight3 = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) | |||||
| bias3 = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(dimoutput, )).astype(np.float32)) | |||||
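| # note: the initial hidden state is hard-coded for a batch size of 128 (the default --batch-size) | |||||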
| last_state = tf.zeros((128, dimhidden), dtype=tf.float32) | |||||
| for i in range(nsteps): | |||||
| cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) | |||||
| h = tf.matmul(cur_x, weight1) + bias1 | |||||
| s = tf.concat([h, last_state], axis=1) | |||||
| s = tf.matmul(s, weight2) + bias2 | |||||
| last_state = tf.nn.relu(s) | |||||
| final_state = last_state | |||||
| y = tf.matmul(final_state, weight3) + bias3 | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| @@ -0,0 +1,113 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def tf_conv2d(x, in_channel, out_channel, stride=1): | |||||
| weight = tf.Variable(np.random.normal(scale=0.1, size=( | |||||
| out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) | |||||
| x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME') | |||||
| return x | |||||
| def tf_batch_norm_with_relu(x, hidden): | |||||
| scale = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(hidden,)).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(hidden,)).astype(np.float32)) | |||||
| axis = list(range(len(x.shape) - 1)) | |||||
| a_mean, a_var = tf.nn.moments(x, axis) | |||||
| x = tf.nn.batch_normalization( | |||||
| x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) | |||||
| x = tf.nn.relu(x) | |||||
| return x | |||||
| def tf_resnet_block(x, in_channel, num_blocks, is_first=False): | |||||
| if is_first: | |||||
| out_channel = in_channel | |||||
| identity = x | |||||
| x = tf_conv2d(x, in_channel, out_channel, stride=1) | |||||
| x = tf_batch_norm_with_relu(x, out_channel) | |||||
| x = tf_conv2d(x, out_channel, out_channel, stride=1) | |||||
| x = x + identity | |||||
| else: | |||||
| out_channel = 2 * in_channel | |||||
| identity = x | |||||
| x = tf_batch_norm_with_relu(x, in_channel) | |||||
| x = tf_conv2d(x, in_channel, out_channel, stride=2) | |||||
| x = tf_batch_norm_with_relu(x, out_channel) | |||||
| x = tf_conv2d(x, out_channel, out_channel, stride=1) | |||||
| identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[ | |||||
| 1, 2, 2, 1], padding='VALID') | |||||
| identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [ | |||||
| in_channel // 2, in_channel // 2]]) | |||||
| x = x + identity | |||||
| for i in range(1, num_blocks): | |||||
| identity = x | |||||
| x = tf_batch_norm_with_relu(x, out_channel) | |||||
| x = tf_conv2d(x, out_channel, out_channel, stride=1) | |||||
| x = tf_batch_norm_with_relu(x, out_channel) | |||||
| x = tf_conv2d(x, out_channel, out_channel, stride=1) | |||||
| x = x + identity | |||||
| return x | |||||
| def tf_fc(x, shape): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=shape[-1:]).astype(np.float32)) | |||||
| x = tf.matmul(x, weight) + bias | |||||
| return x | |||||
| def tf_resnet(x, y_, num_layers, num_class=10): | |||||
| ''' | |||||
| ResNet model in TensorFlow, for CIFAR10 dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| num_layers: 18 or 34 | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| print("Number of Class: {}".format(num_class)) | |||||
| base_size = 16 | |||||
| x = tf_conv2d(x, 3, base_size, stride=1) | |||||
| x = tf_batch_norm_with_relu(x, base_size) | |||||
| if num_layers == 18: | |||||
| print("Building ResNet-18 model in tensorflow...") | |||||
| x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True) | |||||
| x = tf_resnet_block(x, base_size, num_blocks=2) | |||||
| x = tf_resnet_block(x, 2 * base_size, num_blocks=2) | |||||
| x = tf_resnet_block(x, 4 * base_size, num_blocks=2) | |||||
| elif num_layers == 34: | |||||
| print("Building ResNet-34 model in tensorflow...") | |||||
| x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True) | |||||
| x = tf_resnet_block(x, base_size, num_blocks=4) | |||||
| x = tf_resnet_block(x, 2 * base_size, num_blocks=6) | |||||
| x = tf_resnet_block(x, 4 * base_size, num_blocks=3) | |||||
| else: | |||||
| assert False, "Number of layers should be 18 or 34 !" | |||||
| x = tf_batch_norm_with_relu(x, 8 * base_size) | |||||
| x = tf.transpose(x, [0, 3, 1, 2]) | |||||
| x = tf.reshape(x, [-1, 128 * base_size]) | |||||
| y = tf_fc(x, (128 * base_size, num_class)) | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| def tf_resnet18(x, y_, num_class=10): | |||||
| return tf_resnet(x, y_, 18, num_class) | |||||
| def tf_resnet34(x, y_, num_class=10): | |||||
| return tf_resnet(x, y_, 34, num_class) | |||||
| @@ -0,0 +1,103 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| def conv_bn_relu(x, in_channel, out_channel): | |||||
| weight = tf.Variable(np.random.normal(scale=0.1, size=( | |||||
| out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) | |||||
| scale = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(out_channel,)).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(out_channel,)).astype(np.float32)) | |||||
| x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME') | |||||
| axis = list(range(len(x.shape) - 1)) | |||||
| a_mean, a_var = tf.nn.moments(x, axis) | |||||
| x = tf.nn.batch_normalization( | |||||
| x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) | |||||
| x = tf.nn.relu(x) | |||||
| return x | |||||
| def vgg_2block(x, in_channel, out_channel): | |||||
| x = conv_bn_relu(x, in_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ | |||||
| 1, 2, 2, 1], padding='VALID') | |||||
| return x | |||||
| def vgg_3block(x, in_channel, out_channel): | |||||
| x = conv_bn_relu(x, in_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ | |||||
| 1, 2, 2, 1], padding='VALID') | |||||
| return x | |||||
| def vgg_4block(x, in_channel, out_channel): | |||||
| x = conv_bn_relu(x, in_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = conv_bn_relu(x, out_channel, out_channel) | |||||
| x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ | |||||
| 1, 2, 2, 1], padding='VALID') | |||||
| return x | |||||
| def tf_fc(x, in_feat, out_feat): | |||||
| weight = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(in_feat, out_feat)).astype(np.float32)) | |||||
| bias = tf.Variable(np.random.normal( | |||||
| scale=0.1, size=(out_feat,)).astype(np.float32)) | |||||
| x = tf.matmul(x, weight) + bias | |||||
| return x | |||||
| def tf_vgg(x, y_, num_layers, num_class=10): | |||||
| ''' | |||||
| VGG model in TensorFlow, for CIFAR10 dataset. | |||||
| Parameters: | |||||
| x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) | |||||
| y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| num_layers: 16 or 19 | |||||
| Return: | |||||
| loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) | |||||
| y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) | |||||
| ''' | |||||
| if num_layers == 16: | |||||
| print('Building VGG-16 model in tensorflow') | |||||
| x = vgg_2block(x, 3, 64) | |||||
| x = vgg_2block(x, 64, 128) | |||||
| x = vgg_3block(x, 128, 256) | |||||
| x = vgg_3block(x, 256, 512) | |||||
| x = vgg_3block(x, 512, 512) | |||||
| elif num_layers == 19: | |||||
| print('Building VGG-19 model in tensorflow') | |||||
| x = vgg_2block(x, 3, 64) | |||||
| x = vgg_2block(x, 64, 128) | |||||
| x = vgg_4block(x, 128, 256) | |||||
| x = vgg_4block(x, 256, 512) | |||||
| x = vgg_4block(x, 512, 512) | |||||
| else: | |||||
| assert False, "Number of layers should be 18 or 34 !" | |||||
| x = tf.reshape(x, [-1, 512]) | |||||
| x = tf_fc(x, 512, 4096) | |||||
| x = tf_fc(x, 4096, 4096) | |||||
| y = tf_fc(x, 4096, num_class) | |||||
| print("Number of Class: {}".format(num_class)) | |||||
| loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) | |||||
| loss = tf.reduce_mean(loss) | |||||
| return loss, y | |||||
| def tf_vgg16(x, y_, num_class=10): | |||||
| return tf_vgg(x, y_, 16, num_class) | |||||
| def tf_vgg19(x, y_, num_class=10): | |||||
| return tf_vgg(x, y_, 19, num_class) | |||||
| @@ -0,0 +1,213 @@ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.optim as optim | |||||
| import torch.nn.functional as F | |||||
| import torch.backends.cudnn as cudnn | |||||
| from pytorch_models import * | |||||
| import hetu as ht | |||||
| import numpy as np | |||||
| import argparse | |||||
| from time import time | |||||
| import os | |||||
| import logging | |||||
| logging.basicConfig(level=logging.INFO, | |||||
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |||||
| logger = logging.getLogger(__name__) | |||||
| def print_rank0(msg): | |||||
| if local_rank % 8 == 0: | |||||
| logger.info(msg) | |||||
| def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None): | |||||
| print_rank0('Epoch: %d' % epoch) | |||||
| n_train_batches = data.shape[0] // batch_size | |||||
| net.train() | |||||
| train_loss = 0 | |||||
| correct = 0 | |||||
| total = 0 | |||||
| for minibatch_index in range(n_train_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| inputs = torch.Tensor(data[minibatch_start:minibatch_end]) | |||||
| targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() | |||||
| inputs, targets = inputs.to(device), targets.to(device) | |||||
| optimizer.zero_grad() | |||||
| outputs = net(inputs) | |||||
| loss = criterion(outputs, targets) | |||||
| loss.backward() | |||||
| optimizer.step() | |||||
| train_loss += loss.item() | |||||
| _, predicted = outputs.max(1) | |||||
| total += targets.size(0) | |||||
| correct += predicted.eq(targets).sum().item() | |||||
| print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1))) | |||||
| print_rank0("Train accuracy = %f" % (100.*correct/total)) | |||||
| def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None): | |||||
| net.eval() | |||||
| n_test_batches = data.shape[0] // batch_size | |||||
| test_loss = 0 | |||||
| correct = 0 | |||||
| total = 0 | |||||
| with torch.no_grad(): | |||||
| for minibatch_index in range(n_test_batches): | |||||
| minibatch_start = minibatch_index * args.batch_size | |||||
| minibatch_end = (minibatch_index + 1) * args.batch_size | |||||
| inputs = torch.Tensor(data[minibatch_start:minibatch_end]) | |||||
| targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() | |||||
| inputs, targets = inputs.to(device), targets.to(device) | |||||
| outputs = net(inputs) | |||||
| loss = criterion(outputs, targets) | |||||
| test_loss += loss.item() | |||||
| _, predicted = outputs.max(1) | |||||
| total += targets.size(0) | |||||
| correct += predicted.eq(targets).sum().item() | |||||
| print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1))) | |||||
| print_rank0("Validation accuracy = %f" % (100.*correct/total)) | |||||
| if __name__ == "__main__": | |||||
| # argument parser | |||||
| global local_rank | |||||
| local_rank = 0 | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument('--model', type=str, required=True, | |||||
| help='model to be tested') | |||||
| parser.add_argument('--dataset', type=str, required=True, | |||||
| help='dataset to be trained on') | |||||
| parser.add_argument('--batch-size', type=int, | |||||
| default=128, help='batch size') | |||||
| parser.add_argument('--learning-rate', type=float, | |||||
| default=0.1, help='learning rate') | |||||
| parser.add_argument('--opt', type=str, default='sgd', | |||||
| help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') | |||||
| parser.add_argument('--num-epochs', type=int, | |||||
| default=20, help='epoch number') | |||||
| parser.add_argument('--gpu', type=int, default=0, | |||||
| help='gpu to be used, -1 means cpu') | |||||
| parser.add_argument('--validate', action='store_true', | |||||
| help='whether to use validation') | |||||
| parser.add_argument('--timing', action='store_true', | |||||
| help='whether to time the training phase') | |||||
| parser.add_argument('--distributed', action='store_true', | |||||
| help='whether to distributed training') | |||||
| parser.add_argument('--local_rank', type=int, default=-1) | |||||
| args = parser.parse_args() | |||||
| if args.distributed == True: | |||||
| init_method = 'tcp://' | |||||
| master_ip = os.getenv('MASTER_ADDR', 'localhost') | |||||
| master_port = os.getenv('MASTER_PORT', '6000') | |||||
| init_method += master_ip + ':' + master_port | |||||
| rank = int(os.getenv('RANK', '0')) | |||||
| world_size = int(os.getenv("WORLD_SIZE", '1')) | |||||
| print("***"*50) | |||||
| print(init_method) | |||||
| torch.distributed.init_process_group(backend="nccl", | |||||
| world_size=world_size, | |||||
| rank=rank, | |||||
| init_method=init_method) | |||||
| if args.gpu == -1: | |||||
| device = 'cpu' | |||||
| else: | |||||
| if args.distributed == True: | |||||
| local_rank = rank % torch.cuda.device_count() | |||||
| torch.cuda.set_device(local_rank) | |||||
| device = torch.device('cuda:%d' % local_rank) | |||||
| logger.info('Use GPU %d.' % local_rank) | |||||
| else: | |||||
| device = torch.device('cuda:%d' % args.gpu) | |||||
| torch.cuda.set_device(args.gpu) | |||||
| print_rank0('Use GPU %d.' % args.gpu) | |||||
| assert args.model in ['mlp', 'resnet18', 'resnet34', | |||||
| 'vgg16', 'vgg19', 'rnn'], 'Model not supported now.' | |||||
| assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] | |||||
| dataset = args.dataset | |||||
| if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': | |||||
| net = eval(args.model)(100) | |||||
| elif args.model == 'rnn': | |||||
| net = eval(args.model)(28, 10, 128, 28) | |||||
| else: | |||||
| net = eval(args.model)() | |||||
| net.to(device) | |||||
| if args.distributed: | |||||
| net = torch.nn.parallel.DistributedDataParallel( | |||||
| net, device_ids=[local_rank]) | |||||
| assert args.opt in ['sgd', 'momentum', 'nesterov', | |||||
| 'adagrad', 'adam'], 'Optimizer not supported!' | |||||
| if args.opt == 'sgd': | |||||
| print_rank0('Use SGD Optimizer.') | |||||
| opt = optim.SGD(net.parameters(), lr=args.learning_rate) | |||||
| elif args.opt == 'momentum': | |||||
| print_rank0('Use Momentum Optimizer.') | |||||
| opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9) | |||||
| elif args.opt == 'nesterov': | |||||
| print_rank0('Use Nesterov Momentum Optimizer.') | |||||
| opt = optim.SGD(net.parameters(), lr=args.learning_rate, | |||||
| momentum=0.9, nesterov=True) | |||||
| elif args.opt == 'adagrad': | |||||
| print_rank0('Use AdaGrad Optimizer.') | |||||
| opt = optim.Adagrad(net.parameters(), lr=args.learning_rate) | |||||
| else: | |||||
| print_rank0('Use Adam Optimizer.') | |||||
| opt = optim.Adam(net.parameters(), lr=args.learning_rate) | |||||
| criterion = nn.CrossEntropyLoss() | |||||
| # data loading | |||||
| print_rank0('Loading %s data...' % dataset) | |||||
| if dataset == 'MNIST': | |||||
| datasets = ht.data.mnist(onehot=False) | |||||
| train_set_x, train_set_y = datasets[0] | |||||
| valid_set_x, valid_set_y = datasets[1] | |||||
| test_set_x, test_set_y = datasets[2] | |||||
| elif dataset == 'CIFAR10': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( | |||||
| num_class=10, onehot=False) | |||||
| if args.model == "mlp": | |||||
| train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) | |||||
| valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) | |||||
| elif dataset == 'CIFAR100': | |||||
| train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( | |||||
| num_class=100, onehot=False) | |||||
| running_time = 0 | |||||
| # training | |||||
| print_rank0("Start training loop...") | |||||
| for i in range(args.num_epochs + 1): | |||||
| if args.timing: | |||||
| start = time() | |||||
| train(epoch=i, net=net, data=train_set_x, label=train_set_y, | |||||
| batch_size=args.batch_size, criterion=criterion, optimizer=opt) | |||||
| if args.timing: | |||||
| end = time() | |||||
| print_rank0("Running time of current epoch = %fs" % (end - start)) | |||||
| if i != 0: | |||||
| running_time += (end - start) | |||||
| test(epoch=i, net=net, data=valid_set_x, label=valid_set_y, | |||||
| batch_size=args.batch_size, criterion=criterion) | |||||
| print_rank0("*"*50) | |||||
| print_rank0("Running time of total %d epoch = %fs" % | |||||
| (args.num_epochs, running_time)) | |||||
| @@ -0,0 +1,9 @@ | |||||
| { | |||||
| "DMLC_ROLE":"worker", | |||||
| "WORKER_ID":"0", | |||||
| "DMLC_PS_ROOT_URI":"127.0.0.1", | |||||
| "DMLC_PS_ROOT_PORT":"13030", | |||||
| "DMLC_NUM_WORKER":"2", | |||||
| "DMLC_NUM_SERVER":"1", | |||||
| "DMLC_PS_VAN_TYPE":"p3" | |||||
| } | |||||
| @@ -0,0 +1,9 @@ | |||||
| { | |||||
| "DMLC_ROLE":"worker", | |||||
| "WORKER_ID":"1", | |||||
| "DMLC_PS_ROOT_URI":"127.0.0.1", | |||||
| "DMLC_PS_ROOT_PORT":"13030", | |||||
| "DMLC_NUM_WORKER":"2", | |||||
| "DMLC_NUM_SERVER":"1", | |||||
| "DMLC_PS_VAN_TYPE":"p3" | |||||
| } | |||||
| @@ -0,0 +1,2 @@ | |||||
| datasets/ | |||||
| logs/ | |||||
| @@ -0,0 +1,109 @@ | |||||
| # CTR Examples (with Distributed Settings) | |||||
| In this directory we provide several models for CTR tasks. We train the Wide & Deep model on the Adult and Criteo datasets, and the DeepFM, DCN, and DC models on the Criteo dataset. | |||||
| ## Structure | |||||
| ``` | |||||
| - ctr | |||||
| - datasets/ contains sampled criteo data | |||||
| - models/ ctr models in hetu | |||||
| - tf_models/ ctr models in tensorflow | |||||
| - settings/ configurations for distributed training | |||||
| - tests/ test scripts | |||||
| - kill.sh script to kill all python processes | |||||
| - run_hetu.py basic trainer for hetu | |||||
| - run_tf_local.py local trainer for tensorflow | |||||
| - run_tf_horovod.py trainer for tensorflow in horovod setting | |||||
| - run_tf_parallax.py trainer for tensorflow in parallax setting | |||||
| - tf_launch_server.py launcher for server in tensorflow | |||||
| - tf_launch_worker.py launcher for worker in tensorflow | |||||
| ``` | |||||
| ## Prepare criteo data | |||||
| * We provide a sampled version of the kaggle-criteo dataset, located in ./datasets/criteo/ . To use this sample, do not specify the 'all' or 'val' flags when running the test files; a minimal loading sketch is shown below. | |||||
| * To download the original kaggle-criteo dataset, specify a valid source in models/load_data.py and run ```python models/load_data.py``` to download and preprocess the whole dataset. | |||||
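| For reference, here is a minimal sketch (not part of the test scripts) of loading the provided sampled data through the helpers in models/load_data.py; it assumes you run it from this directory so that the relative dataset path resolves: | |||||
| ```python | |||||
| # Minimal sketch: load the sampled Criteo arrays shipped under ./datasets/criteo/ | |||||
| from models.load_data import process_sampled_criteo_data | |||||
| dense, sparse, labels = process_sampled_criteo_data() | |||||
| # dense: (N, 13) float features, sparse: (N, 26) integer feature ids, labels: click labels | |||||
| print(dense.shape, sparse.shape, labels.shape) | |||||
| ``` | |||||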
| ## Flags for test files | |||||
| Here we explain some of the flags you may use in test files: | |||||
| * model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult') | |||||
| * config: to specify the configuration file in settings. | |||||
| * val: whether to use validation. | |||||
| * cache: whether to use an embedding cache in PS/Hybrid mode. | |||||
| * bsp: whether to use BSP instead of the default ASP in PS/Hybrid mode. (In Hybrid mode, AllReduce enforces BSP for the dense parameters, so there are no stragglers for them.) | |||||
| * all: whether to use all criteo data. | |||||
| * bound: the per-entry staleness bound for embeddings when caching is enabled; defaults to 100. | |||||
| ## Usage | |||||
| If enough memory is available, you can try running the models locally: | |||||
| ```bash | |||||
| # run locally | |||||
| bash tests/local_{model}_{dataset}.sh | |||||
| # run in ps setting (locally) | |||||
| bash tests/ps_{model}_{dataset}.sh | |||||
| # run in hybrid setting (locally) | |||||
| bash tests/hybrid_{model}_{dataset}.sh | |||||
| # run tensorflow locally | |||||
| python run_tf_local.py --model {model}_{dataset} | |||||
| # run tensorflow in horovod | |||||
| horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset} | |||||
| # run tensorflow in parallax | |||||
| python {absolute_path_to}/run_tf_parallax.py | |||||
| # run tensorflow in ps setting | |||||
| python tf_launch_server.py --config {config} --id {rank} | |||||
| python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config} | |||||
| ``` | |||||
| ## Configuration | |||||
| We use a simple yaml file to specify the run configuration. | |||||
| ```yaml | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 4 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| launch : | |||||
| worker : 4 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| ``` | |||||
| The four key-value pairs in "shared" configure the PS-lite parameter server and are exported into the environment. When running on a cluster, change "DMLC_PS_ROOT_URI" to an IP address reachable within the cluster. | |||||
| The "launch" section is only used in PS mode (and omitted in Hybrid mode); it specifies how many workers, servers, and schedulers are launched locally on this machine. In Hybrid mode, workers are launched by mpirun, while the scheduler and servers are launched separately with python -m hetu.launcher (see the Hybrid mode example below). | |||||
| ## Examples | |||||
| ### Local execution | |||||
| Run WDL on Criteo locally (if the whole dataset has been downloaded, you can use all the data or the validation data): | |||||
| ```bash | |||||
| python run_hetu.py --model wdl_criteo (--all) (--val) | |||||
| ``` | |||||
| ### PS mode execution | |||||
| Run in PS mode locally; the same procedure also works across multiple nodes. | |||||
| ```bash | |||||
| # launch scheduler and server, -n means number of servers, --sched means using scheduler | |||||
| python -m hetu.launcher {config} -n 1 --sched | |||||
| # launch workers (or run scheduler and server together if configured in config file) | |||||
| python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10) | |||||
| ``` | |||||
| You can also specify which cache policy to use and the cache bound. | |||||
| ### Hybrid mode execution | |||||
| You must launch a scheduler and server in one terminal: | |||||
| ```bash | |||||
| python -m hetu.launcher {config} -n 1 --sched | |||||
| ``` | |||||
| Then launch the workers simultaneously using the mpirun command: | |||||
| ```bash | |||||
| mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ... | |||||
| ``` | |||||
| Or, in a multi-node setting: | |||||
| ```bash | |||||
| mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ... | |||||
| ``` | |||||
| @@ -0,0 +1,3 @@ | |||||
| #!/bin/bash | |||||
| #pkill -f mnist_mlp_ps.py | |||||
| kill -9 $(pidof python) | |||||
| @@ -0,0 +1,5 @@ | |||||
| from .wdl_adult import wdl_adult | |||||
| from .dcn_criteo import dcn_criteo | |||||
| from .dc_criteo import dc_criteo | |||||
| from .wdl_criteo import wdl_criteo | |||||
| from .deepfm_criteo import dfm_criteo | |||||
| @@ -0,0 +1,63 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| import time | |||||
| def residual_layer(x0, input_dim, hidden_dim): | |||||
| embedding_len = input_dim | |||||
| weight_1 = init.random_normal( | |||||
| shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1') | |||||
| bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1') | |||||
| weight_2 = init.random_normal( | |||||
| shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2') | |||||
| bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2') | |||||
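| # Residual unit: y = relu(x0 + (relu(x0 @ weight_1 + bias_1) @ weight_2 + bias_2)) | |||||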
| x0w = ht.matmul_op(x0, weight_1) # (batch, hidden_dim) | |||||
| x0w_b = x0w + ht.broadcastto_op(bias_1, x0w) | |||||
| relu1 = ht.relu_op(x0w_b) | |||||
| x1w = ht.matmul_op(relu1, weight_2) # (batch, input_dim) | |||||
| x1w_b = x1w + ht.broadcastto_op(bias_2, x1w) | |||||
| residual = x1w_b + x0 | |||||
| y = ht.relu_op(residual) | |||||
| return y | |||||
| def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3): | |||||
| for i in range(num_layers): | |||||
| x0 = residual_layer(x0, input_dim, hidden_dim) | |||||
| return x0 | |||||
| def dc_criteo(dense_input, sparse_input, y_): | |||||
| feature_dimension = 33762577 | |||||
| embedding_size = 8 | |||||
| learning_rate = 0.001 | |||||
| Embedding = init.random_normal( | |||||
| [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding") | |||||
| sparse_input = ht.embedding_lookup_op(Embedding, sparse_input) | |||||
| sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) | |||||
| # dc_model | |||||
| x = ht.concat_op(sparse_input, dense_input, axis=1) | |||||
| input_dim = 26 * 8 + 13 | |||||
| hidden_dim = input_dim | |||||
| residual_out = build_residual_layers( | |||||
| x, input_dim, hidden_dim, num_layers=5) | |||||
| W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4") | |||||
| y = ht.matmul_op(residual_out, W4) | |||||
| y = ht.sigmoid_op(y) | |||||
| loss = ht.binarycrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) | |||||
| train_op = opt.minimize(loss) | |||||
| return loss, y, y_, train_op | |||||
| @@ -0,0 +1,68 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| import time | |||||
| def cross_layer(x0, x1): | |||||
| # x0: input embedding feature (batch_size, 26 * embedding_size + 13) | |||||
| # x1: the output of last layer (batch_size, 26 * embedding_size + 13) | |||||
| embedding_len = 26 * 128 + 13 | |||||
| weight = init.random_normal( | |||||
| shape=(embedding_len, 1), stddev=0.01, name='weight') | |||||
| bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias') | |||||
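| # Cross layer: y = x0 * (x1 @ weight) + x1 + bias, i.e. explicit feature crossing plus a residual connection | |||||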
| x1w = ht.matmul_op(x1, weight) # (batch_size, 1) | |||||
| y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0)) | |||||
| y = y + x1 + ht.broadcastto_op(bias, y) | |||||
| return y | |||||
| def build_cross_layer(x0, num_layers=3): | |||||
| x1 = x0 | |||||
| for i in range(num_layers): | |||||
| x1 = cross_layer(x0, x1) | |||||
| return x1 | |||||
| def dcn_criteo(dense_input, sparse_input, y_): | |||||
| feature_dimension = 33762577 | |||||
| embedding_size = 128 | |||||
| learning_rate = 0.003 | |||||
| Embedding = init.random_normal( | |||||
| [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) | |||||
| sparse_input = ht.embedding_lookup_op( | |||||
| Embedding, sparse_input, ctx=ht.cpu(0)) | |||||
| sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) | |||||
| x = ht.concat_op(sparse_input, dense_input, axis=1) | |||||
| # Cross Network | |||||
| cross_output = build_cross_layer(x, num_layers=3) | |||||
| # DNN | |||||
| flatten = x | |||||
| W1 = init.random_normal( | |||||
| [26*embedding_size + 13, 256], stddev=0.01, name="W1") | |||||
| W2 = init.random_normal([256, 256], stddev=0.01, name="W2") | |||||
| W3 = init.random_normal([256, 256], stddev=0.01, name="W3") | |||||
| W4 = init.random_normal( | |||||
| [256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4") | |||||
| fc1 = ht.matmul_op(flatten, W1) | |||||
| relu1 = ht.relu_op(fc1) | |||||
| fc2 = ht.matmul_op(relu1, W2) | |||||
| relu2 = ht.relu_op(fc2) | |||||
| y3 = ht.matmul_op(relu2, W3) | |||||
| y4 = ht.concat_op(cross_output, y3, axis=1) | |||||
| y = ht.matmul_op(y4, W4) | |||||
| y = ht.sigmoid_op(y) | |||||
| loss = ht.binarycrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) | |||||
| train_op = opt.minimize(loss) | |||||
| return loss, y, y_, train_op | |||||
| @@ -0,0 +1,59 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| import time | |||||
| def dfm_criteo(dense_input, sparse_input, y_): | |||||
| feature_dimension = 33762577 | |||||
| embedding_size = 128 | |||||
| learning_rate = 0.01 | |||||
| # FM | |||||
| Embedding1 = init.random_normal( | |||||
| [feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0)) | |||||
| FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter") | |||||
| sparse_1dim_input = ht.embedding_lookup_op( | |||||
| Embedding1, sparse_input, ctx=ht.cpu(0)) | |||||
| fm_dense_part = ht.matmul_op(dense_input, FM_W) | |||||
| fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1) | |||||
| # fst order output | |||||
| y1 = fm_dense_part + fm_sparse_part | |||||
| Embedding2 = init.random_normal( | |||||
| [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) | |||||
| sparse_2dim_input = ht.embedding_lookup_op( | |||||
| Embedding2, sparse_input, ctx=ht.cpu(0)) | |||||
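| # FM second-order term via the identity sum_{i<j} v_i*v_j = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), | |||||
| # computed field-wise on the embedding vectors and then summed over the embedding dimension. | |||||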
| sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1) | |||||
| sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum) | |||||
| sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input) | |||||
| sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1) | |||||
| sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum | |||||
| sparse_2dim_half = sparse_2dim * 0.5 | |||||
| # snd order output | |||||
| y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True) | |||||
| # DNN | |||||
| flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size)) | |||||
| W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1") | |||||
| W2 = init.random_normal([256, 256], stddev=0.01, name="W2") | |||||
| W3 = init.random_normal([256, 1], stddev=0.01, name="W3") | |||||
| fc1 = ht.matmul_op(flatten, W1) | |||||
| relu1 = ht.relu_op(fc1) | |||||
| fc2 = ht.matmul_op(relu1, W2) | |||||
| relu2 = ht.relu_op(fc2) | |||||
| y3 = ht.matmul_op(relu2, W3) | |||||
| y4 = y1 + y2 | |||||
| y = y4 + y3 | |||||
| y = ht.sigmoid_op(y) | |||||
| loss = ht.binarycrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) | |||||
| train_op = opt.minimize(loss) | |||||
| return loss, y, y_, train_op | |||||
| @@ -0,0 +1,320 @@ | |||||
| import os | |||||
| import numpy as np | |||||
| ########################################################################### | |||||
| # criteo | |||||
| ########################################################################### | |||||
| def download_criteo(path): | |||||
| import tarfile | |||||
| import pandas as pd | |||||
| from six.moves import urllib | |||||
| if not os.path.exists(path): | |||||
| os.makedirs(path) | |||||
| assert os.path.isdir(path), 'Please provide a directory path.' | |||||
| # this source may be invalid, please use other valid sources. | |||||
| origin = ( | |||||
| 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz' | |||||
| ) | |||||
| print('Downloading data from %s' % origin) | |||||
| dataset = os.path.join(path, 'criteo.tar.gz') | |||||
| urllib.request.urlretrieve(origin, dataset) | |||||
| print("Extracting criteo zip...") | |||||
| with tarfile.open(dataset) as f: | |||||
| f.extractall(path=path) | |||||
| print("Create local files...") | |||||
| # save csv filed | |||||
| df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None) | |||||
| df.columns = ['label'] + ["I" + | |||||
| str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)] | |||||
| df.to_csv(os.path.join(path, "train.csv"), index=0) | |||||
| print('Csv file saved.') | |||||
| # save numpy arrays | |||||
| target_path = [os.path.join(path, filename) for filename in [ | |||||
| 'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy', | |||||
| 'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']] | |||||
| dense_feats = [col for col in df.columns if col.startswith('I')] | |||||
| sparse_feats = [col for col in df.columns if col.startswith('C')] | |||||
| labels = df['label'] | |||||
| dense_feats = process_dense_feats(df, dense_feats) | |||||
| sparse_feats = process_sparse_feats(df, sparse_feats) | |||||
| num_data = dense_feats.shape[0] | |||||
| perm = np.random.permutation(num_data) | |||||
| # split data in 2 parts | |||||
| test_num = num_data // 10 | |||||
| processed_data = [ | |||||
| dense_feats[perm[:-test_num]], # train dense | |||||
| sparse_feats[perm[:-test_num]], # train sparse | |||||
| labels[perm[:-test_num]], # train labels | |||||
| dense_feats[perm[-test_num:]], # validate dense | |||||
| sparse_feats[perm[-test_num:]], # validate sparse | |||||
| labels[perm[-test_num:]], # validate labels | |||||
| ] | |||||
| print('Array shapes:') | |||||
| for i in range(len(processed_data)): | |||||
| print(os.path.split(target_path[i]) | |||||
| [-1].split('.')[0], processed_data[i].shape) | |||||
| np.save(target_path[i], processed_data[i]) | |||||
| print('Numpy arrays saved.') | |||||
| def process_dense_feats(data, feats): | |||||
| d = data.copy() | |||||
| d = d[feats].fillna(0.0) | |||||
| for f in feats: | |||||
| d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1) | |||||
| return d | |||||
| def process_sparse_feats(data, feats): | |||||
| from sklearn.preprocessing import LabelEncoder | |||||
| # process to embeddings. | |||||
| d = data.copy() | |||||
| d = d[feats].fillna("-1") | |||||
| for f in feats: | |||||
| label_encoder = LabelEncoder() | |||||
| d[f] = label_encoder.fit_transform(d[f]) | |||||
| feature_cnt = 0 | |||||
| for f in feats: | |||||
| d[f] += feature_cnt | |||||
| feature_cnt += d[f].nunique() | |||||
| return d | |||||
| def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True): | |||||
| import pandas as pd | |||||
| csv_path = os.path.join(path, "train.csv") | |||||
| if not os.path.exists(csv_path): | |||||
| download_criteo(path) | |||||
| df = pd.read_csv(csv_path, nrows=nrows, header=0) | |||||
| dense_feats = [col for col in df.columns if col.startswith('I')] | |||||
| sparse_feats = [col for col in df.columns if col.startswith('C')] | |||||
| labels = np.array(df['label']).reshape(-1, 1) | |||||
| dense_feats = np.array(process_dense_feats(df, dense_feats)) | |||||
| sparse_feats = np.array(process_sparse_feats( | |||||
| df, sparse_feats)).astype(np.int32) | |||||
| if return_val: | |||||
| test_num = nrows // 10 | |||||
| train_dense = dense_feats[:-test_num] | |||||
| train_sparse = sparse_feats[:-test_num] | |||||
| train_label = labels[:-test_num] | |||||
| validate_dense = dense_feats[-test_num:] | |||||
| validate_sparse = sparse_feats[-test_num:] | |||||
| validate_label = labels[-test_num:] | |||||
| return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label) | |||||
| else: | |||||
| return dense_feats, sparse_feats, labels | |||||
| def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')): | |||||
| # all data should be available! no checking. | |||||
| processed_data = [np.load(os.path.join(path, filename)) | |||||
| for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']] | |||||
| return tuple(processed_data) | |||||
| def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True): | |||||
| file_paths = [os.path.join(path, filename) for filename in [ | |||||
| 'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy', | |||||
| 'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']] | |||||
| if not all([os.path.exists(p) for p in file_paths]): | |||||
| download_criteo(path) | |||||
| files = [np.load(filename) for filename in file_paths] | |||||
| if return_val: | |||||
| return (files[0], files[1]), (files[2], files[3]), (files[4], files[5]) | |||||
| else: | |||||
| return files[0], files[2], files[4] | |||||
| ########################################################################### | |||||
| # adult | |||||
| ########################################################################### | |||||
| def maybe_download(train_data, test_data): | |||||
| """If the adult data "train.csv" and "test.csv" are not in the working directory, download them.""" | |||||
| import pandas as pd | |||||
| COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", | |||||
| "marital_status", "occupation", "relationship", "race", "gender", | |||||
| "capital_gain", "capital_loss", "hours_per_week", "native_country", | |||||
| "income_bracket"] | |||||
| if not os.path.exists(train_data): | |||||
| print("downloading training data...") | |||||
| df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", | |||||
| names=COLUMNS, skipinitialspace=True) | |||||
| else: | |||||
| df_train = pd.read_csv("train.csv") | |||||
| if not os.path.exists(test_data): | |||||
| print("downloading testing data...") | |||||
| df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", | |||||
| names=COLUMNS, skipinitialspace=True, skiprows=1) | |||||
| else: | |||||
| df_test = pd.read_csv("test.csv") | |||||
| return df_train, df_test | |||||
| def cross_columns(x_cols): | |||||
| """simple helper to build the crossed columns in a pandas dataframe | |||||
| """ | |||||
| crossed_columns = dict() | |||||
| colnames = ['_'.join(x_c) for x_c in x_cols] | |||||
| for cname, x_c in zip(colnames, x_cols): | |||||
| crossed_columns[cname] = x_c | |||||
| return crossed_columns | |||||
| def val2idx(df, cols): | |||||
| """helper to index categorical columns before embeddings. | |||||
| """ | |||||
| val_types = dict() | |||||
| for c in cols: | |||||
| val_types[c] = df[c].unique() | |||||
| val_to_idx = dict() | |||||
| for k, v in val_types.items(): | |||||
| val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} | |||||
| for k, v in val_to_idx.items(): | |||||
| df[k] = df[k].apply(lambda x: v[x]) | |||||
| unique_vals = dict() | |||||
| for c in cols: | |||||
| unique_vals[c] = df[c].nunique() | |||||
| return df, unique_vals | |||||
| def onehot(x): | |||||
| from sklearn.preprocessing import OneHotEncoder | |||||
| return np.array(OneHotEncoder().fit_transform(x).todense()) | |||||
| def wide(df_train, df_test, wide_cols, x_cols, target): | |||||
| import pandas as pd | |||||
| print('Processing wide data') | |||||
| df_train['IS_TRAIN'] = 1 | |||||
| df_test['IS_TRAIN'] = 0 | |||||
| df_wide = pd.concat([df_train, df_test]) | |||||
| crossed_columns_d = cross_columns(x_cols) | |||||
| categorical_columns = list( | |||||
| df_wide.select_dtypes(include=['object']).columns) | |||||
| wide_cols += list(crossed_columns_d.keys()) | |||||
| for k, v in crossed_columns_d.items(): | |||||
| df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) | |||||
| df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] | |||||
| dummy_cols = [ | |||||
| c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] | |||||
| df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) | |||||
| train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) | |||||
| test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) | |||||
| assert all(train.columns == test.columns) | |||||
| cols = [c for c in train.columns if c != target] | |||||
| X_train = train[cols].values | |||||
| y_train = train[target].values.reshape(-1, 1) | |||||
| X_test = test[cols].values | |||||
| y_test = test[target].values.reshape(-1, 1) | |||||
| return X_train, y_train, X_test, y_test | |||||
| def load_adult_data(return_val=True): | |||||
| import pandas as pd | |||||
| df_train, df_test = maybe_download("train.csv", "test.csv") | |||||
| df_train['income_label'] = ( | |||||
| df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) | |||||
| df_test['income_label'] = ( | |||||
| df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) | |||||
| age_groups = [0, 25, 65, 90] | |||||
| age_labels = range(len(age_groups) - 1) | |||||
| df_train['age_group'] = pd.cut( | |||||
| df_train['age'], age_groups, labels=age_labels) | |||||
| df_test['age_group'] = pd.cut( | |||||
| df_test['age'], age_groups, labels=age_labels) | |||||
| # columns for wide model | |||||
| wide_cols = ['workclass', 'education', 'marital_status', 'occupation', | |||||
| 'relationship', 'race', 'gender', 'native_country', 'age_group'] | |||||
| x_cols = (['education', 'occupation'], ['native_country', 'occupation']) | |||||
| # columns for deep model | |||||
| embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', | |||||
| 'relationship', 'race', 'gender', 'native_country'] | |||||
| cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] | |||||
| target = 'income_label' | |||||
| x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( | |||||
| df_train, df_test, wide_cols, x_cols, target) | |||||
| x_train_wide = np.array(x_train_wide).astype(np.float32) | |||||
| x_test_wide = np.array(x_test_wide).astype(np.float32) | |||||
| print('Processing deep data') | |||||
| df_train['IS_TRAIN'] = 1 | |||||
| df_test['IS_TRAIN'] = 0 | |||||
| df_deep = pd.concat([df_train, df_test]) | |||||
| deep_cols = embedding_cols + cont_cols | |||||
| df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] | |||||
| from sklearn.preprocessing import StandardScaler | |||||
| scaler = StandardScaler() | |||||
| df_deep[cont_cols] = pd.DataFrame(scaler.fit_transform(df_train[cont_cols]), | |||||
| columns=cont_cols) | |||||
| df_deep, unique_vals = val2idx(df_deep, embedding_cols) | |||||
| train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) | |||||
| test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) | |||||
| x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) | |||||
| y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) | |||||
| x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) | |||||
| y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) | |||||
| x_train_deep = np.transpose(x_train_deep) | |||||
| x_test_deep = np.transpose(x_test_deep) | |||||
| y_train = onehot(y_train) | |||||
| y_test = onehot(y_test) | |||||
| if return_val: | |||||
| return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test | |||||
| else: | |||||
| return x_train_deep, x_train_wide, y_train | |||||
| ########################################################################### | |||||
| # avazu | |||||
| ########################################################################### | |||||
| def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')): | |||||
| import pandas as pd | |||||
| # please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data | |||||
| train_file = os.path.join(path, 'train.csv') | |||||
| # test_file = os.path.join(path, 'test.csv') # useless, no labels | |||||
| df_train = pd.read_csv(train_file) | |||||
| sparse_feats = process_sparse_feats(df_train, df_train.columns[2:]) | |||||
| # the embedding num for each feature: | |||||
| # [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60] | |||||
| # sum: 9449445 | |||||
| np.save(os.path.join(path, 'sparse.npy'), sparse_feats) | |||||
| if __name__ == '__main__': | |||||
| download_criteo(os.path.join(os.path.split( | |||||
| os.path.abspath(__file__))[0], '../datasets/criteo')) | |||||
| @@ -0,0 +1,56 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| def wdl_adult(X_deep, X_wide, y_): | |||||
| lr = 5 / 128 | |||||
| dim_wide = 809 | |||||
| dim_deep = 68 | |||||
| W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W") | |||||
| W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1") | |||||
| b1 = init.random_normal([50], stddev=0.1, name="b1") | |||||
| W2 = init.random_normal([50, 20], stddev=0.1, name="W2") | |||||
| b2 = init.random_normal([20], stddev=0.1, name="b2") | |||||
| # deep | |||||
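| # The first 8 deep features are categorical: each is looked up in its own (50 x 8) embedding table | |||||
| # and the resulting vectors are concatenated; the last 4 deep features are continuous and appended directly. | |||||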
| Embedding = [] | |||||
| X_deep_input = None | |||||
| for i in range(8): | |||||
| Embedding_name = "Embedding_deep_" + str(i) | |||||
| Embedding.append(init.random_normal( | |||||
| [50, 8], stddev=0.1, name=Embedding_name)) | |||||
| now = ht.embedding_lookup_op(Embedding[i], X_deep[i]) | |||||
| now = ht.array_reshape_op(now, (-1, 8)) | |||||
| if X_deep_input is None: | |||||
| X_deep_input = now | |||||
| else: | |||||
| X_deep_input = ht.concat_op(X_deep_input, now, 1) | |||||
| for i in range(4): | |||||
| now = ht.array_reshape_op(X_deep[i + 8], (-1, 1)) | |||||
| X_deep_input = ht.concat_op(X_deep_input, now, 1) | |||||
| mat1 = ht.matmul_op(X_deep_input, W1) | |||||
| add1 = mat1 + ht.broadcastto_op(b1, mat1) | |||||
| relu1 = ht.relu_op(add1) | |||||
| dropout1 = relu1 | |||||
| mat2 = ht.matmul_op(dropout1, W2) | |||||
| add2 = mat2 + ht.broadcastto_op(b2, mat2) | |||||
| relu2 = ht.relu_op(add2) | |||||
| dropout2 = relu2 | |||||
| dmodel = dropout2 | |||||
| # wide | |||||
| wmodel = ht.concat_op(X_wide, dmodel, 1) | |||||
| wmodel = ht.matmul_op(wmodel, W) | |||||
| prediction = wmodel | |||||
| loss = ht.softmaxcrossentropy_op(prediction, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=lr) | |||||
| train_op = opt.minimize(loss) | |||||
| return loss, prediction, y_, train_op | |||||
| @@ -0,0 +1,42 @@ | |||||
| import hetu as ht | |||||
| from hetu import init | |||||
| import numpy as np | |||||
| import time | |||||
| def wdl_criteo(dense_input, sparse_input, y_): | |||||
| feature_dimension = 33762577 | |||||
| embedding_size = 128 | |||||
| learning_rate = 0.01 | |||||
| Embedding = init.random_normal( | |||||
| [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) | |||||
| sparse_input = ht.embedding_lookup_op( | |||||
| Embedding, sparse_input, ctx=ht.cpu(0)) | |||||
| sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) | |||||
| # DNN | |||||
| flatten = dense_input | |||||
| W1 = init.random_normal([13, 256], stddev=0.01, name="W1") | |||||
| W2 = init.random_normal([256, 256], stddev=0.01, name="W2") | |||||
| W3 = init.random_normal([256, 256], stddev=0.01, name="W3") | |||||
| W4 = init.random_normal( | |||||
| [256 + 26*embedding_size, 1], stddev=0.01, name="W4") | |||||
| fc1 = ht.matmul_op(flatten, W1) | |||||
| relu1 = ht.relu_op(fc1) | |||||
| fc2 = ht.matmul_op(relu1, W2) | |||||
| relu2 = ht.relu_op(fc2) | |||||
| y3 = ht.matmul_op(relu2, W3) | |||||
| y4 = ht.concat_op(sparse_input, y3, axis=1) | |||||
| y = ht.matmul_op(y4, W4) | |||||
| y = ht.sigmoid_op(y) | |||||
| loss = ht.binarycrossentropy_op(y, y_) | |||||
| loss = ht.reduce_mean_op(loss, [0]) | |||||
| opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) | |||||
| train_op = opt.minimize(loss) | |||||
| return loss, y, y_, train_op | |||||
| @@ -0,0 +1,230 @@ | |||||
| import hetu as ht | |||||
| from hetu.launcher import launch | |||||
| import os | |||||
| import os.path as osp | |||||
| import numpy as np | |||||
| import yaml | |||||
| import time | |||||
| import argparse | |||||
| from tqdm import tqdm | |||||
| from sklearn import metrics | |||||
| def worker(args): | |||||
| def train(iterations, auc_enabled=True, tqdm_enabled=False): | |||||
| localiter = tqdm(range(iterations) | |||||
| ) if tqdm_enabled else range(iterations) | |||||
| train_loss = [] | |||||
| train_acc = [] | |||||
| if auc_enabled: | |||||
| train_auc = [] | |||||
| for it in localiter: | |||||
| loss_val, predict_y, y_val, _ = executor.run( | |||||
| 'train', convert_to_numpy_ret_vals=True) | |||||
| if y_val.shape[1] == 1: # for criteo case | |||||
| acc_val = np.equal( | |||||
| y_val, | |||||
| predict_y > 0.5).astype(np.float32) | |||||
| else: | |||||
| acc_val = np.equal( | |||||
| np.argmax(y_val, 1), | |||||
| np.argmax(predict_y, 1)).astype(np.float32) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| if auc_enabled: | |||||
| train_auc.append(metrics.roc_auc_score(y_val, predict_y)) | |||||
| if auc_enabled: | |||||
| return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc) | |||||
| else: | |||||
| return np.mean(train_loss), np.mean(train_acc) | |||||
| def validate(iterations, tqdm_enabled=False): | |||||
| localiter = tqdm(range(iterations) | |||||
| ) if tqdm_enabled else range(iterations) | |||||
| test_loss = [] | |||||
| test_acc = [] | |||||
| test_auc = [] | |||||
| for it in localiter: | |||||
| loss_val, test_y_predicted, y_test_val = executor.run( | |||||
| 'validate', convert_to_numpy_ret_vals=True) | |||||
| if y_test_val.shape[1] == 1: # for criteo case | |||||
| correct_prediction = np.equal( | |||||
| y_test_val, | |||||
| test_y_predicted > 0.5).astype(np.float32) | |||||
| else: | |||||
| correct_prediction = np.equal( | |||||
| np.argmax(y_test_val, 1), | |||||
| np.argmax(test_y_predicted, 1)).astype(np.float32) | |||||
| test_loss.append(loss_val[0]) | |||||
| test_acc.append(correct_prediction) | |||||
| test_auc.append(metrics.roc_auc_score( | |||||
| y_test_val, test_y_predicted)) | |||||
| return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc) | |||||
| def get_current_shard(data): | |||||
| if args.comm is not None: | |||||
| part_size = data.shape[0] // nrank | |||||
| start = part_size * rank | |||||
| end = start + part_size if rank != nrank - 1 else data.shape[0] | |||||
| return data[start:end] | |||||
| else: | |||||
| return data | |||||
| batch_size = 128 | |||||
| dataset = args.dataset | |||||
| model = args.model | |||||
| device_id = 0 | |||||
| if args.comm == 'PS': | |||||
| rank = ht.get_worker_communicate().rank() | |||||
| nrank = int(os.environ['DMLC_NUM_WORKER']) | |||||
| device_id = rank % 8 | |||||
| elif args.comm == 'Hybrid': | |||||
| comm = ht.wrapped_mpi_nccl_init() | |||||
| device_id = comm.dev_id | |||||
| rank = comm.rank | |||||
| nrank = int(os.environ['DMLC_NUM_WORKER']) | |||||
| if dataset == 'criteo': | |||||
| # define models for criteo | |||||
| if args.all: | |||||
| from models.load_data import process_all_criteo_data | |||||
| dense, sparse, labels = process_all_criteo_data( | |||||
| return_val=args.val) | |||||
| elif args.val: | |||||
| from models.load_data import process_head_criteo_data | |||||
| dense, sparse, labels = process_head_criteo_data(return_val=True) | |||||
| else: | |||||
| from models.load_data import process_sampled_criteo_data | |||||
| dense, sparse, labels = process_sampled_criteo_data() | |||||
| if isinstance(dense, tuple): | |||||
| dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [ | |||||
| get_current_shard(dense[1]), batch_size, 'validate']]) | |||||
| sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [ | |||||
| get_current_shard(sparse[1]), batch_size, 'validate']]) | |||||
| y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [ | |||||
| get_current_shard(labels[1]), batch_size, 'validate']]) | |||||
| else: | |||||
| dense_input = ht.dataloader_op( | |||||
| [[get_current_shard(dense), batch_size, 'train']]) | |||||
| sparse_input = ht.dataloader_op( | |||||
| [[get_current_shard(sparse), batch_size, 'train']]) | |||||
| y_ = ht.dataloader_op( | |||||
| [[get_current_shard(labels), batch_size, 'train']]) | |||||
| elif dataset == 'adult': | |||||
| from models.load_data import load_adult_data | |||||
| x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data() | |||||
| dense_input = [ | |||||
| ht.dataloader_op([ | |||||
| [get_current_shard(x_train_deep[:, i]), batch_size, 'train'], | |||||
| [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'], | |||||
| ]) for i in range(12) | |||||
| ] | |||||
| sparse_input = ht.dataloader_op([ | |||||
| [get_current_shard(x_train_wide), batch_size, 'train'], | |||||
| [get_current_shard(x_test_wide), batch_size, 'validate'], | |||||
| ]) | |||||
| y_ = ht.dataloader_op([ | |||||
| [get_current_shard(y_train), batch_size, 'train'], | |||||
| [get_current_shard(y_test), batch_size, 'validate'], | |||||
| ]) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| print("Data loaded.") | |||||
| loss, prediction, y_, train_op = model(dense_input, sparse_input, y_) | |||||
| eval_nodes = {'train': [loss, prediction, y_, train_op]} | |||||
| if args.val: | |||||
| print('Validation enabled...') | |||||
| eval_nodes['validate'] = [loss, prediction, y_] | |||||
| executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs') | |||||
| executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id), | |||||
| comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path) | |||||
| if args.all and dataset == 'criteo': | |||||
| print('Processing all data...') | |||||
| file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[ | |||||
| args.comm], args.raw_model) | |||||
| file_path += '%d.log' % rank if args.comm else '.log' | |||||
| file_path = osp.join(osp.dirname( | |||||
| osp.abspath(__file__)), 'logs', file_path) | |||||
| log_file = open(file_path, 'w') | |||||
| total_epoch = args.nepoch if args.nepoch > 0 else 11 | |||||
| for ep in range(total_epoch): | |||||
| print("ep: %d" % ep) | |||||
| ep_st = time.time() | |||||
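| # Each epoch trains 1/10 of the training batches; every 10th epoch also consumes the remainder (cf. --nepoch help). | |||||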
| train_loss, train_acc, train_auc = train(executor.get_batch_num( | |||||
| 'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True) | |||||
| ep_en = time.time() | |||||
| if args.val: | |||||
| val_loss, val_acc, val_auc = validate( | |||||
| executor.get_batch_num('validate')) | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ | |||||
| % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st) | |||||
| else: | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ | |||||
| % (train_loss, train_acc, train_auc, ep_en - ep_st) | |||||
| print(printstr) | |||||
| log_file.write(printstr + '\n') | |||||
| log_file.flush() | |||||
| else: | |||||
| total_epoch = args.nepoch if args.nepoch > 0 else 50 | |||||
| for ep in range(total_epoch): | |||||
| if ep == 5: | |||||
| start = time.time() | |||||
| print("epoch %d" % ep) | |||||
| ep_st = time.time() | |||||
| train_loss, train_acc = train( | |||||
| executor.get_batch_num('train'), auc_enabled=False) | |||||
| ep_en = time.time() | |||||
| if args.val: | |||||
| val_loss, val_acc, val_auc = validate( | |||||
| executor.get_batch_num('validate')) | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" | |||||
| % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc)) | |||||
| else: | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" | |||||
| % (train_loss, train_acc, ep_en - ep_st)) | |||||
| print('all time:', time.time() - start) | |||||
| if __name__ == '__main__': | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument("--model", type=str, required=True, | |||||
| help="model to be tested") | |||||
| parser.add_argument("--val", action="store_true", | |||||
| help="whether to use validation") | |||||
| parser.add_argument("--all", action="store_true", | |||||
| help="whether to use all data") | |||||
| parser.add_argument("--comm", default=None, | |||||
| help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid") | |||||
| parser.add_argument("--bsp", action="store_true", | |||||
| help="whether to use bsp instead of asp") | |||||
| parser.add_argument("--cache", default=None, help="cache policy") | |||||
| parser.add_argument("--bound", default=100, help="cache bound") | |||||
| parser.add_argument("--config", type=str, default=osp.join(osp.dirname( | |||||
| osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps") | |||||
| parser.add_argument("--nepoch", type=int, default=-1, | |||||
| help="num of epochs, each train 1/10 data") | |||||
| args = parser.parse_args() | |||||
| import models | |||||
| print('Model:', args.model) | |||||
| model = eval('models.' + args.model) | |||||
| args.dataset = args.model.split('_')[-1] | |||||
| args.raw_model = args.model | |||||
| args.model = model | |||||
| if args.comm is None: | |||||
| worker(args) | |||||
| elif args.comm == 'Hybrid': | |||||
| settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader) | |||||
| value = settings['shared'] | |||||
| os.environ['DMLC_ROLE'] = 'worker' | |||||
| for k, v in value.items(): | |||||
| os.environ[k] = str(v) | |||||
| worker(args) | |||||
| elif args.comm == 'PS': | |||||
| launch(worker, args) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| @@ -0,0 +1,174 @@ | |||||
| import os | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| import time | |||||
| import argparse | |||||
| from tqdm import tqdm | |||||
| from sklearn import metrics | |||||
| import horovod.tensorflow as hvd | |||||
| def pop_env(): | |||||
| for k in ['https_proxy', 'http_proxy']: | |||||
| if k in os.environ: | |||||
| os.environ.pop(k) | |||||
| pop_env() | |||||
| # horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model | |||||
| # horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model | |||||
| # if using multi nodes setting in conda, need to modify /etc/bash.bashrc | |||||
| # we can also use mpirun (default gloo): | |||||
| # ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ | |||||
| # -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model | |||||
| def train_criteo(model, args): | |||||
| hvd.init() | |||||
| def get_current_shard(data): | |||||
| part_size = data.shape[0] // hvd.size() | |||||
| start = part_size * hvd.rank() | |||||
| end = start + part_size if hvd.rank() != hvd.size() - \ | |||||
| 1 else data.shape[0] | |||||
| return data[start:end] | |||||
| if args.all: | |||||
| from models.load_data import process_all_criteo_data | |||||
| dense, sparse, all_labels = process_all_criteo_data() | |||||
| dense_feature = get_current_shard(dense[0]) | |||||
| sparse_feature = get_current_shard(sparse[0]) | |||||
| labels = get_current_shard(all_labels[0]) | |||||
| val_dense = get_current_shard(dense[1]) | |||||
| val_sparse = get_current_shard(sparse[1]) | |||||
| val_labels = get_current_shard(all_labels[1]) | |||||
| else: | |||||
| from models.load_data import process_sampled_criteo_data | |||||
| dense_feature, sparse_feature, labels = process_sampled_criteo_data() | |||||
| dense_feature = get_current_shard(dense_feature) | |||||
| sparse_feature = get_current_shard(sparse_feature) | |||||
| labels = get_current_shard(labels) | |||||
| batch_size = 128 | |||||
| dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) | |||||
| sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) | |||||
| loss, y, opt = model(dense_input, sparse_input, y_) | |||||
| global_step = tf.compat.v1.train.get_or_create_global_step() | |||||
| # here in DistributedOptimizer by default all tensor are reduced on GPU | |||||
| # can use device_sparse=xxx, device_dense=xxx to modify | |||||
| # if using device_sparse='/cpu:0', the performance degrades | |||||
| train_op = hvd.DistributedOptimizer( | |||||
| opt).minimize(loss, global_step=global_step) | |||||
| gpu_options = tf.compat.v1.GPUOptions( | |||||
| allow_growth=True, visible_device_list=str(hvd.local_rank())) | |||||
| # here horovod default use gpu to initialize, which will cause OOM | |||||
| hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] | |||||
| sess = tf.compat.v1.train.MonitoredTrainingSession( | |||||
| hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) | |||||
| my_feed_dict = { | |||||
| dense_input: np.empty(shape=(batch_size, 13)), | |||||
| sparse_input: np.empty(shape=(batch_size, 26)), | |||||
| y_: np.empty(shape=(batch_size, 1)), | |||||
| } | |||||
| if args.all: | |||||
| raw_log_file = './logs/tf_hvd_%s_%d.log' % ( | |||||
| args.model, hvd.local_rank()) | |||||
| print('Processing all data, log to', raw_log_file) | |||||
| log_file = open(raw_log_file, 'w') | |||||
| iterations = dense_feature.shape[0] // batch_size | |||||
| total_epoch = 400 | |||||
| start_index = 0 | |||||
| for ep in range(total_epoch): | |||||
| print("epoch %d" % ep) | |||||
| st_time = time.time() | |||||
| train_loss, train_acc, train_auc = [], [], [] | |||||
| for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| start_index += batch_size | |||||
| if start_index + batch_size > dense_feature.shape[0]: | |||||
| start_index = 0 | |||||
| loss_val = sess.run([loss, y, y_, train_op], | |||||
| feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| train_auc.append(metrics.roc_auc_score(true_val, pred_val)) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| tra_auc = np.mean(train_auc) | |||||
| en_time = time.time() | |||||
| train_time = en_time - st_time | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ | |||||
| % (tra_loss, tra_accuracy, tra_auc, train_time) | |||||
| print(printstr) | |||||
| log_file.write(printstr + '\n') | |||||
| log_file.flush() | |||||
| else: | |||||
| iterations = dense_feature.shape[0] // batch_size | |||||
| epoch = 50 | |||||
| for ep in range(epoch): | |||||
| print('epoch', ep) | |||||
| if ep == 5: | |||||
| start = time.time() | |||||
| ep_st = time.time() | |||||
| train_loss = [] | |||||
| train_acc = [] | |||||
| for idx in range(iterations): | |||||
| start_index = idx * batch_size | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| loss_val = sess.run([loss, y, y_, train_op], | |||||
| feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| if pred_val.shape[1] == 1: # for criteo case | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| else: | |||||
| acc_val = np.equal( | |||||
| np.argmax(pred_val, 1), | |||||
| np.argmax(true_val, 1)).astype(np.float32) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| ep_en = time.time() | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" | |||||
| % (tra_loss, tra_accuracy, ep_en - ep_st)) | |||||
| print('all time:', (time.time() - start)) | |||||
| def main(): | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument("--model", type=str, required=True, | |||||
| help="model to be tested") | |||||
| parser.add_argument("--all", action="store_true", | |||||
| help="whether to use all data") | |||||
| args = parser.parse_args() | |||||
| raw_model = args.model | |||||
| import tf_models | |||||
| model = eval('tf_models.' + raw_model) | |||||
| dataset = raw_model.split('_')[-1] | |||||
| print('Model:', raw_model) | |||||
| train_criteo(model, args) | |||||
| if __name__ == '__main__': | |||||
| main() | |||||
| @@ -0,0 +1,202 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| import time | |||||
| import argparse | |||||
| from tqdm import tqdm | |||||
| from sklearn import metrics | |||||
| def train_criteo(model, args): | |||||
| if args.all: | |||||
| from models.load_data import process_all_criteo_data | |||||
| dense, sparse, all_labels = process_all_criteo_data() | |||||
| dense_feature, val_dense = dense | |||||
| sparse_feature, val_sparse = sparse | |||||
| labels, val_labels = all_labels | |||||
| else: | |||||
| from models.load_data import process_sampled_criteo_data | |||||
| dense_feature, sparse_feature, labels = process_sampled_criteo_data() | |||||
| batch_size = 128 | |||||
| dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) | |||||
| sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) | |||||
| loss, y, opt = model(dense_input, sparse_input, y_) | |||||
| train_op = opt.minimize(loss) | |||||
| init = tf.compat.v1.global_variables_initializer() | |||||
| gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) | |||||
| sess = tf.compat.v1.Session( | |||||
| config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) | |||||
| sess.run(init) | |||||
| my_feed_dict = { | |||||
| dense_input: np.empty(shape=(batch_size, 13)), | |||||
| sparse_input: np.empty(shape=(batch_size, 26)), | |||||
| y_: np.empty(shape=(batch_size, 1)), | |||||
| } | |||||
| if args.all: | |||||
| raw_log_file = './logs/tf_local_%s.log' % (args.model) | |||||
| print('Processing all data, log to', raw_log_file) | |||||
| log_file = open(raw_log_file, 'w') | |||||
| iterations = dense_feature.shape[0] // batch_size | |||||
| total_epoch = 11 | |||||
| start_index = 0 | |||||
| for ep in range(total_epoch): | |||||
| print("epoch %d" % ep) | |||||
| st_time = time.time() | |||||
| train_loss, train_acc, train_auc = [], [], [] | |||||
| for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| start_index += batch_size | |||||
| if start_index + batch_size > dense_feature.shape[0]: | |||||
| start_index = 0 | |||||
| loss_val = sess.run([loss, y, y_, train_op], | |||||
| feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| train_auc.append(metrics.roc_auc_score(true_val, pred_val)) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| tra_auc = np.mean(train_auc) | |||||
| en_time = time.time() | |||||
| train_time = en_time - st_time | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ | |||||
| % (tra_loss, tra_accuracy, tra_auc, train_time) | |||||
| print(printstr) | |||||
| log_file.write(printstr + '\n') | |||||
| log_file.flush() | |||||
| else: | |||||
| iteration = dense_feature.shape[0] // batch_size | |||||
| epoch = 50 | |||||
| for ep in range(epoch): | |||||
| print('epoch', ep) | |||||
| if ep == 5: | |||||
| start = time.time() | |||||
| ep_st = time.time() | |||||
| train_loss = [] | |||||
| train_acc = [] | |||||
| for idx in range(iteration): | |||||
| start_index = idx * batch_size | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| loss_val = sess.run([loss, y, y_, train_op], | |||||
| feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| if pred_val.shape[1] == 1: # for criteo case | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| else: | |||||
| acc_val = np.equal( | |||||
| np.argmax(pred_val, 1), | |||||
| np.argmax(true_val, 1)).astype(np.float32) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| ep_en = time.time() | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" | |||||
| % (tra_loss, tra_accuracy, ep_en - ep_st)) | |||||
| print('all time:', (time.time() - start)) | |||||
| def train_adult(model): | |||||
| batch_size = 128 | |||||
| total_epoch = 50 | |||||
| dim_wide = 809 | |||||
| X_deep = [] | |||||
| for i in range(8): | |||||
| X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1])) | |||||
| for i in range(4): | |||||
| X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1])) | |||||
| X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2]) | |||||
| loss, y, train_op = model(X_deep, X_wide, y_) | |||||
| init = tf.compat.v1.global_variables_initializer() | |||||
| gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) | |||||
| sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) | |||||
| sess.run(init) | |||||
| from models.load_data import load_adult_data | |||||
| x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False) | |||||
| iterations = x_train_deep.shape[0] // batch_size | |||||
| for ep in range(total_epoch): | |||||
| print('epoch', ep) | |||||
| if ep == 5: | |||||
| start = time.time() | |||||
| ep_st = time.time() | |||||
| train_loss = [] | |||||
| train_acc = [] | |||||
| pre_index = 0 | |||||
| for it in range(iterations): | |||||
| batch_x_deep = x_train_deep[pre_index:pre_index + batch_size] | |||||
| batch_x_wide = x_train_wide[pre_index:pre_index + batch_size] | |||||
| batch_y = y_train[pre_index:pre_index + batch_size] | |||||
| pre_index += batch_size | |||||
| my_feed_dict = dict() | |||||
| for i in range(12): | |||||
| my_feed_dict[X_deep[i]] = np.array( | |||||
| batch_x_deep[:, i]).reshape(-1, 1) | |||||
| my_feed_dict[X_wide] = np.array(batch_x_wide) | |||||
| my_feed_dict[y_] = batch_y | |||||
| loss_val = sess.run([loss, y, y_, train_op], | |||||
| feed_dict=my_feed_dict) | |||||
| acc_val = np.equal( | |||||
| np.argmax(loss_val[1], 1), | |||||
| np.argmax(loss_val[2], 1)).astype(np.float32) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| ep_en = time.time() | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" | |||||
| % (tra_loss, tra_accuracy, ep_en - ep_st)) | |||||
| print('all time:', (time.time() - start)) | |||||
| def main(): | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument("--model", type=str, required=True, | |||||
| help="model to be tested") | |||||
| parser.add_argument("--all", action="store_true", | |||||
| help="whether to use all data") | |||||
| args = parser.parse_args() | |||||
| raw_model = args.model | |||||
| import tf_models | |||||
| model = eval('tf_models.' + raw_model) | |||||
| dataset = raw_model.split('_')[-1] | |||||
| print('Model:', raw_model) | |||||
| if dataset == 'criteo': | |||||
| train_criteo(model, args) | |||||
| elif dataset == 'adult': | |||||
| train_adult(model) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| if __name__ == '__main__': | |||||
| main() | |||||
| @@ -0,0 +1,211 @@ | |||||
| import os | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
| import time | |||||
| import argparse | |||||
| from tqdm import tqdm | |||||
| from sklearn import metrics | |||||
| from autodist import AutoDist | |||||
| from autodist.resource_spec import ResourceSpec | |||||
| from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax | |||||
| from autodist.strategy.base import Strategy | |||||
| from autodist.kernel.common.utils import get_op_name | |||||
| from tensorflow.python.framework import ops | |||||
| def pop_env(): | |||||
| for k in ['https_proxy', 'http_proxy']: | |||||
| if k in os.environ: | |||||
| os.environ.pop(k) | |||||
| pop_env() | |||||
| # Please DO NOT modify /etc/bash.bashrc to activate conda environment. | |||||
| # Use python_venv in spec yml file instead. | |||||
| # Use absolute path of python file. | |||||
| # Here we use the tf native partitioner instead of autodist's PartitionPS. | |||||
| class Parallaxx(PSLoadBalancing, AllReduce): | |||||
| """ | |||||
| Modify original parallax to remove replica on CPUs. | |||||
| """ | |||||
| def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0): | |||||
| PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness) | |||||
| AllReduce.__init__(self, chunk_size) | |||||
| # pylint: disable=attribute-defined-outside-init | |||||
| def build(self, graph_item, resource_spec): | |||||
| """Generate the strategy.""" | |||||
| expr = Strategy() | |||||
| # For each variable, generate variable synchronizer config | |||||
| expr.graph_config.replicas.extend( | |||||
| [k for k, v in resource_spec.gpu_devices]) | |||||
| reduction_device_names = [k for k, _ in resource_spec.cpu_devices] | |||||
| self.loads = {ps: 0.0 for ps in reduction_device_names} | |||||
| # Generate node config | |||||
| node_config = [] | |||||
| for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): | |||||
| var_op_name = get_op_name(var.name) | |||||
| grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] | |||||
| if isinstance(grad, ops.Tensor): # this is a dense variable | |||||
| group_id = idx // self.chunk_size | |||||
| config = self._gen_all_reduce_node_config( | |||||
| var.name, group=group_id) | |||||
| else: # sparse updates | |||||
| # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. | |||||
| # Sparse variables are likely larger, so keeping copies would be costlier, | |||||
| # and usually each device only requires a small part of the overall variable. | |||||
| config = self._gen_ps_node_config( | |||||
| var, | |||||
| False,  # no local proxy variable, as explained above | |||||
| self._sync, | |||||
| self._staleness | |||||
| ) | |||||
| node_config.append(config) | |||||
| expr.node_config.extend(node_config) | |||||
| return expr | |||||
| def train_criteo(model, args): | |||||
| resource_spec_file = os.path.join(os.path.dirname( | |||||
| __file__), 'settings', 'plx_local_spec.yml') | |||||
| autodist = AutoDist(resource_spec_file, Parallaxx()) | |||||
| respec = ResourceSpec(resource_spec_file) | |||||
| if args.all: | |||||
| from models.load_data import process_all_criteo_data | |||||
| dense, sparse, all_labels = process_all_criteo_data() | |||||
| dense_feature, val_dense = dense | |||||
| sparse_feature, val_sparse = sparse | |||||
| labels, val_labels = all_labels | |||||
| else: | |||||
| from models.load_data import process_sampled_criteo_data | |||||
| dense_feature, sparse_feature, labels = process_sampled_criteo_data() | |||||
| # autodist will split the feeding data | |||||
| batch_size = 128 | |||||
| with tf.Graph().as_default() as g, autodist.scope(): | |||||
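| # Criteo samples have 13 dense (integer) features and 26 sparse (categorical) | |||||
| # features; labels are binary click / no-click. | |||||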
| dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) | |||||
| sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) | |||||
| y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) | |||||
| embed_partitioner = tf.compat.v1.fixed_size_partitioner( | |||||
| len(respec.nodes), 0) if len(respec.nodes) > 1 else None | |||||
| loss, y, opt = model(dense_input, sparse_input, | |||||
| y_, embed_partitioner, False) | |||||
| train_op = opt.minimize(loss) | |||||
| sess = autodist.create_distributed_session() | |||||
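| # Pre-allocate the feed buffers once; they are overwritten in place every iteration. | |||||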
| my_feed_dict = { | |||||
| dense_input: np.empty(shape=(batch_size, 13)), | |||||
| sparse_input: np.empty(shape=(batch_size, 26)), | |||||
| y_: np.empty(shape=(batch_size, 1)), | |||||
| } | |||||
| if args.all: | |||||
| raw_log_file = os.path.join(os.path.dirname( | |||||
| os.path.abspath(__file__)), 'logs', 'tf_plx_%s.log' % (args.model)) | |||||
| print('Processing all data, log to', raw_log_file) | |||||
| log_file = open(raw_log_file, 'w') | |||||
| iterations = dense_feature.shape[0] // batch_size | |||||
| total_epoch = 11 | |||||
| start_index = 0 | |||||
| for ep in range(total_epoch): | |||||
| print("epoch %d" % ep) | |||||
| st_time = time.time() | |||||
| train_loss, train_acc, train_auc = [], [], [] | |||||
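| # Each "epoch" here covers one tenth of the batches (plus the remainder on every | |||||
| # tenth epoch), so ten consecutive epochs make one full pass over the data. | |||||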
| for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| start_index += batch_size | |||||
| if start_index + batch_size > dense_feature.shape[0]: | |||||
| start_index = 0 | |||||
| loss_val = sess.run( | |||||
| [loss, y, y_, train_op], feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| train_auc.append(metrics.roc_auc_score(true_val, pred_val)) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| tra_auc = np.mean(train_auc) | |||||
| en_time = time.time() | |||||
| train_time = en_time - st_time | |||||
| printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ | |||||
| % (tra_loss, tra_accuracy, tra_auc, train_time) | |||||
| print(printstr) | |||||
| log_file.write(printstr + '\n') | |||||
| log_file.flush() | |||||
| else: | |||||
| iteration = dense_feature.shape[0] // batch_size | |||||
| epoch = 50 | |||||
| for ep in range(epoch): | |||||
| print('epoch', ep) | |||||
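| # Time the run from epoch 5 onward, treating the first epochs as warm-up. | |||||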
| if ep == 5: | |||||
| start = time.time() | |||||
| ep_st = time.time() | |||||
| train_loss = [] | |||||
| train_acc = [] | |||||
| for idx in range(iteration): | |||||
| start_index = idx * batch_size | |||||
| my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] | |||||
| my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] | |||||
| loss_val = sess.run( | |||||
| [loss, y, y_, train_op], feed_dict=my_feed_dict) | |||||
| pred_val = loss_val[1] | |||||
| true_val = loss_val[2] | |||||
| if pred_val.shape[1] == 1: # for criteo case | |||||
| acc_val = np.equal( | |||||
| true_val, | |||||
| pred_val > 0.5) | |||||
| else: | |||||
| acc_val = np.equal( | |||||
| np.argmax(pred_val, 1), | |||||
| np.argmax(true_val, 1)).astype(np.float32) | |||||
| train_loss.append(loss_val[0]) | |||||
| train_acc.append(acc_val) | |||||
| tra_accuracy = np.mean(train_acc) | |||||
| tra_loss = np.mean(train_loss) | |||||
| ep_en = time.time() | |||||
| print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" | |||||
| % (tra_loss, tra_accuracy, ep_en - ep_st)) | |||||
| print('all time:', (time.time() - start)) | |||||
| def main(): | |||||
| parser = argparse.ArgumentParser() | |||||
| parser.add_argument("--model", type=str, required=True, | |||||
| help="model to be tested") | |||||
| parser.add_argument("--all", action="store_true", | |||||
| help="whether to use all data") | |||||
| args = parser.parse_args() | |||||
| raw_model = args.model | |||||
| import tf_models | |||||
| model = getattr(tf_models, raw_model) | |||||
| dataset = raw_model.split('_')[-1] | |||||
| print('Model:', raw_model) | |||||
| if dataset == 'criteo': | |||||
| train_criteo(model, args) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| if __name__ == '__main__': | |||||
| main() | |||||
| @@ -0,0 +1,10 @@ | |||||
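| # ps-lite settings: the scheduler listens on 127.0.0.1:13100 and expects 4 workers | |||||
| # and 1 server to join. With "worker : 0" this file launches only the scheduler and | |||||
| # the server; the workers are started separately (e.g. through mpirun). | |||||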
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 4 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| launch : | |||||
| worker : 0 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| @@ -0,0 +1,10 @@ | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 2 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| launch : | |||||
| worker : 2 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| @@ -0,0 +1,10 @@ | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 4 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| launch : | |||||
| worker : 4 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| @@ -0,0 +1,10 @@ | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 8 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| launch : | |||||
| worker : 8 | |||||
| server : 1 | |||||
| scheduler : true | |||||
| @@ -0,0 +1,6 @@ | |||||
| shared : | |||||
| DMLC_PS_ROOT_URI : 127.0.0.1 | |||||
| DMLC_PS_ROOT_PORT : 13100 | |||||
| DMLC_NUM_WORKER : 4 | |||||
| DMLC_NUM_SERVER : 1 | |||||
| DMLC_PS_VAN_TYPE : p3 | |||||
| @@ -0,0 +1,4 @@ | |||||
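| # AutoDist resource spec: a single local node exposing CPU slot 0 and GPUs 0-7. | |||||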
| nodes: | |||||
| - address: localhost | |||||
| cpus: [0] | |||||
| gpus: [0,1,2,3,4,5,6,7] | |||||
| @@ -0,0 +1,9 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "127.0.0.1:12349", | |||||
| "127.0.0.1:12348" | |||||
| ], | |||||
| "ps": [ | |||||
| "127.0.0.1:12345" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,11 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "127.0.0.1:23459", | |||||
| "127.0.0.1:23458", | |||||
| "127.0.0.1:23457", | |||||
| "127.0.0.1:23456" | |||||
| ], | |||||
| "ps": [ | |||||
| "127.0.0.1:23455" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,15 @@ | |||||
| { | |||||
| "worker": [ | |||||
| "127.0.0.1:34569", | |||||
| "127.0.0.1:34568", | |||||
| "127.0.0.1:34567", | |||||
| "127.0.0.1:34566", | |||||
| "127.0.0.1:34565", | |||||
| "127.0.0.1:34564", | |||||
| "127.0.0.1:34563", | |||||
| "127.0.0.1:34562" | |||||
| ], | |||||
| "ps": [ | |||||
| "127.0.0.1:34575" | |||||
| ] | |||||
| } | |||||
| @@ -0,0 +1,7 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
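| # Launch the scheduler/server processes defined in local_s1.yml in the background, | |||||
| # then start 4 workers with mpirun. "--comm Hybrid" selects Hetu's hybrid mode, which | |||||
| # combines AllReduce for dense parameters with a parameter-server path for the sparse | |||||
| # embeddings, here using the "lfuopt" cache policy with bound 3. | |||||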
| python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & | |||||
| mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml | |||||
| @@ -0,0 +1,7 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & | |||||
| mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml | |||||
| @@ -0,0 +1,7 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & | |||||
| mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml | |||||
| @@ -0,0 +1,7 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & | |||||
| mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model dcn_criteo --val | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model dfm_criteo --val | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model wdl_adult --val | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model wdl_criteo --val | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml | |||||
| @@ -0,0 +1,6 @@ | |||||
| #!/bin/bash | |||||
| workdir=$(cd $(dirname $0); pwd) | |||||
| mainpy=${workdir}/../run_hetu.py | |||||
| python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml | |||||