
initial commit

master
Hsword 4 years ago
parent commit 7fd3de80ce
100 changed files with 5479 additions and 134 deletions
1. +6 -0 .gitmodules
2. +59 -0 CMakeLists.txt
3. +33 -0 COMMITTERS.md
4. +53 -0 CONTRIBUTING.md
5. +201 -124 LICENSE
6. +167 -10 README.md
7. +2 -0 bin/heturun
8. +75 -0 cmake/Modules/FindCUDNN.cmake
9. +70 -0 cmake/Modules/FindMETIS.cmake
10. +14 -0 cmake/Modules/FindMKL.cmake
11. +97 -0 cmake/Modules/FindNCCL.cmake
12. +47 -0 cmake/Modules/FindZMQ.cmake
13. +55 -0 cmake/config.example.cmake
14. +84 -0 environment.yml
15. +49 -0 examples/cnn/README.md
16. +10 -0 examples/cnn/local_s1.yml
17. +202 -0 examples/cnn/main.py
18. +61 -0 examples/cnn/models/AlexNet.py
19. +41 -0 examples/cnn/models/CNN.py
20. +90 -0 examples/cnn/models/LSTM.py
21. +46 -0 examples/cnn/models/LeNet.py
22. +24 -0 examples/cnn/models/LogReg.py
23. +33 -0 examples/cnn/models/MLP.py
24. +56 -0 examples/cnn/models/RNN.py
25. +125 -0 examples/cnn/models/ResNet.py
26. +100 -0 examples/cnn/models/VGG.py
27. +9 -0 examples/cnn/models/__init__.py
28. +4 -0 examples/cnn/pytorch_models/__init__.py
29. +20 -0 examples/cnn/pytorch_models/mlp.py
30. +116 -0 examples/cnn/pytorch_models/resnet.py
31. +36 -0 examples/cnn/pytorch_models/rnn.py
32. +48 -0 examples/cnn/pytorch_models/vgg.py
33. +309 -0 examples/cnn/run_tf_horovod.py
34. +9 -0 examples/cnn/scripts/hetu_16gpu.sh
35. +11 -0 examples/cnn/scripts/hetu_1gpu.sh
36. +10 -0 examples/cnn/scripts/hetu_2gpu_ps.sh
37. +8 -0 examples/cnn/scripts/hetu_8gpu.sh
38. +11 -0 examples/cnn/scripts/horovod_16gpu.sh
39. +6 -0 examples/cnn/scripts/horovod_8gpu.sh
40. +18 -0 examples/cnn/scripts/pytorch_16gpu_0.sh
41. +18 -0 examples/cnn/scripts/pytorch_16gpu_1.sh
42. +7 -0 examples/cnn/scripts/pytorch_1gpu.sh
43. +18 -0 examples/cnn/scripts/pytorch_8gpu.sh
44. +15 -0 examples/cnn/scripts/tf_16gpu_worker0.sh
45. +14 -0 examples/cnn/scripts/tf_16gpu_worker1.sh
46. +10 -0 examples/cnn/scripts/tf_1gpu.sh
47. +15 -0 examples/cnn/scripts/tf_8gpu.sh
48. +23 -0 examples/cnn/settings/tf_dist_s1_w16.json
49. +11 -0 examples/cnn/settings/tf_dist_s1_w4.json
50. +15 -0 examples/cnn/settings/tf_dist_s1_w8.json
51. +49 -0 examples/cnn/tf_launch_server.py
52. +234 -0 examples/cnn/tf_launch_worker.py
53. +194 -0 examples/cnn/tf_main.py
54. +8 -0 examples/cnn/tf_models/__init__.py
55. +45 -0 examples/cnn/tf_models/tf_CNN.py
56. +81 -0 examples/cnn/tf_models/tf_LSTM.py
57. +49 -0 examples/cnn/tf_models/tf_LeNet.py
58. +23 -0 examples/cnn/tf_models/tf_LogReg.py
59. +34 -0 examples/cnn/tf_models/tf_MLP.py
60. +49 -0 examples/cnn/tf_models/tf_RNN.py
61. +113 -0 examples/cnn/tf_models/tf_ResNet.py
62. +103 -0 examples/cnn/tf_models/tf_VGG.py
63. +213 -0 examples/cnn/torch_main.py
64. +9 -0 examples/cnn/worker_conf0.json
65. +9 -0 examples/cnn/worker_conf1.json
66. +2 -0 examples/ctr/.gitignore
67. +109 -0 examples/ctr/README.md
68. +3 -0 examples/ctr/kill.sh
69. +5 -0 examples/ctr/models/__init__.py
70. +63 -0 examples/ctr/models/dc_criteo.py
71. +68 -0 examples/ctr/models/dcn_criteo.py
72. +59 -0 examples/ctr/models/deepfm_criteo.py
73. +320 -0 examples/ctr/models/load_data.py
74. +56 -0 examples/ctr/models/wdl_adult.py
75. +42 -0 examples/ctr/models/wdl_criteo.py
76. +230 -0 examples/ctr/run_hetu.py
77. +174 -0 examples/ctr/run_tf_horovod.py
78. +202 -0 examples/ctr/run_tf_local.py
79. +211 -0 examples/ctr/run_tf_parallax.py
80. +10 -0 examples/ctr/settings/local_s1.yml
81. +10 -0 examples/ctr/settings/local_s1_w2.yml
82. +10 -0 examples/ctr/settings/local_s1_w4.yml
83. +10 -0 examples/ctr/settings/local_s1_w8.yml
84. +6 -0 examples/ctr/settings/local_w4.yml
85. +4 -0 examples/ctr/settings/plx_local_spec.yml
86. +9 -0 examples/ctr/settings/tf_local_s1_w2.json
87. +11 -0 examples/ctr/settings/tf_local_s1_w4.json
88. +15 -0 examples/ctr/settings/tf_local_s1_w8.json
89. +7 -0 examples/ctr/tests/hybrid_dcn_criteo.sh
90. +7 -0 examples/ctr/tests/hybrid_dfm_criteo.sh
91. +7 -0 examples/ctr/tests/hybrid_wdl_adult.sh
92. +7 -0 examples/ctr/tests/hybrid_wdl_criteo.sh
93. +6 -0 examples/ctr/tests/local_dcn_criteo.sh
94. +6 -0 examples/ctr/tests/local_dfm_criteo.sh
95. +6 -0 examples/ctr/tests/local_wdl_adult.sh
96. +6 -0 examples/ctr/tests/local_wdl_criteo.sh
97. +6 -0 examples/ctr/tests/ps_dcn_criteo.sh
98. +6 -0 examples/ctr/tests/ps_dfm_criteo.sh
99. +6 -0 examples/ctr/tests/ps_wdl_adult.sh
100. +6 -0 examples/ctr/tests/ps_wdl_criteo.sh

+6 -0 .gitmodules

@@ -0,0 +1,6 @@
[submodule "third_party/GraphMix"]
path = third_party/GraphMix
url = https://github.com/nox-410/GraphMix.git
[submodule "third_party/HetuML"]
path = third_party/HetuML
url = https://github.com/ccchengff/HetuML.git

+59 -0 CMakeLists.txt

@@ -0,0 +1,59 @@
cmake_minimum_required(VERSION 3.18)

project(Hetu CXX)

include(cmake/config.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_CXX_FLAGS "-O3 -Wall")

# openmp
find_package(OpenMP REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

# compile flag
if(${HETU_VERSION} STREQUAL "all")
  set(HETU_COMPILE_GPU ON)
  set(HETU_COMPILE_MKL ON)
elseif(${HETU_VERSION} STREQUAL "gpu")
  set(HETU_COMPILE_GPU ON)
  set(HETU_COMPILE_MKL OFF)
elseif(${HETU_VERSION} STREQUAL "mkl")
  set(HETU_COMPILE_GPU OFF)
  set(HETU_COMPILE_MKL ON)
else()
  message(FATAL_ERROR "unknown hetu version")
endif()
message(STATUS "HETU version: ${HETU_VERSION}")

# cuda
if(${HETU_COMPILE_GPU})
  set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc)
  file(READ ${CUDAToolkit_ROOT}/version.txt RAW_CUDA_VERSION)
  string(REGEX MATCH "[0-9\.]+" CUDA_VERSION ${RAW_CUDA_VERSION})
  if(${CUDA_VERSION} VERSION_LESS "10.1")
    message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDA_VERSION}")
  endif()
  find_package(CUDAToolkit REQUIRED)
  enable_language(CUDA)
endif()

include(FetchContent) # download third_party

add_subdirectory(${CMAKE_SOURCE_DIR}/src)

if(${HETU_PS})
  add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite)
endif()

if(${HETU_GEOMETRIC})
  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix)
endif()
if (HETU_ML)
  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML)
endif()
enable_testing()

+33 -0 COMMITTERS.md

@@ -0,0 +1,33 @@
## Committer

Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer.

One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers.

Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md).

## Committer Members
### Current Committer
- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](swordonline@foxmail.com)>
- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](ccchengff@gmail.com)>
- [codecaution](https://github.com/codecaution) - **Xiaonan Nie**
- [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang**
- [nox-410](https://github.com/nox-410) - **Yining Shi**
- [initzhang](https://github.com/initzhang) - **Xin Zhang**
- [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang**
- [AFDWang](https://github.com/AFDWang) - **Yujie Wang**
- [sj1104](https://github.com/sj1104) - **Jia Shen**
- [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou**
- [zmxdream](https://github.com/zmxdream) - **Minxu Zhang**

We would like to sincerely thank the following community members for their contributions to Hetu.

- [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)**
- [lbluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
- [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)**
- [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)**
- [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)**
- [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao**
- [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue**
- **Lizi Su**
- **Dong Li**

+53 -0 CONTRIBUTING.md

@@ -0,0 +1,53 @@
# Contributing to Hetu
Welcome to [report Issues](https://github.com/PKU-DAIR/Hetu/issues) or [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It's recommended to read the following Contributing Guide first before contributing.


## Issues
We use Github Issues to track public bugs and feature requests.

### Search Known Issues First
Please search the existing issues to see if any similar issue or feature request has already been filed. You should make sure your issue isn't redundant.

### Reporting New Issues
If you open an issue, the more information the better: for example, a detailed description, screenshots or a video of the problem, and logs or code blocks for the crash.

## Pull Requests
We strongly welcome your pull request to make Hetu better.

### Branch Management
There are two kinds of branches here:

1. `main` branch.

(1). It is the latest (pre-)release branch. We use `main` for tags, with version number `1.0.0`, `1.1.0`, `1.2.0`...

(2). **Don't submit any PR on `main` branch.**
2. `specific version` branches.

(1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable development branch. After full testing, the `specific version` branch will be merged into the `main` branch for the next release.

(2). **You are recommended to submit bugfix or feature PR on `specific version` branch.**


Normal bugfixes or feature requests should be submitted to the `specific version` branch. After full testing, we will merge them into the `main` branch for the next release.


### Make Pull Requests
The code team will monitor all pull requests and run code checks and tests on them. After all tests pass, we will accept the PR, but it will not be merged into the `main` branch at once; there may be some delay.

Before submitting a pull request, please make sure the followings are done:

1. Fork the repo and create your branch from `main` or `specific version`.
2. Update code or documentation if you have changed APIs.
3. Add the copyright notice to the top of any new files you've added.
4. Make sure your code passes lint and style checks.
5. Test your code thoroughly.
6. Now, you can submit your pull request on `specific version` branch.

## Code Style Guide
Use [Code Style](./.clang-format) for Python and C++.

## License
By contributing to Hetu, you agree that your contributions will be licensed
under [License](LICENSE)

+201 -124 LICENSE

@@ -1,124 +1,201 @@
木兰宽松许可证, 第2版

2020年1月 http://license.coscl.org.cn/MulanPSL2

您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束:

0. 定义

“软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。

“贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。

“贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。

“法人实体” 是指提交贡献的机构及其“关联实体”。

“关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。

1. 授予版权许可

每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。

2. 授予专利许可

每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。

3. 无商标许可

“本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。

4. 分发限制

您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。

5. 免责声明与责任限制

“软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。

6. 语言

“本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。

条款结束

如何将木兰宽松许可证,第2版,应用到您的软件

如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步:

1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字;

2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中;

3, 请将如下声明文本放入每个源文件的头部注释中。

Copyright (c) [Year] [name of copyright holder]
[Software Name] is licensed under Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
Mulan Permissive Software License,Version 2
Mulan Permissive Software License,Version 2 (Mulan PSL v2)

January 2020 http://license.coscl.org.cn/MulanPSL2

Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions:

0. Definition

Software means the program and related documents which are licensed under this License and comprise all Contribution(s).

Contribution means the copyrightable work licensed by a particular Contributor under this License.

Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License.

Legal Entity means the entity making a Contribution and all its Affiliates.

Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity.

1. Grant of Copyright License

Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not.

2. Grant of Patent License

Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken.

3. No Trademark License

No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4.

4. Distribution Restriction

You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software.

5. Disclaimer of Warranty and Limitation of Liability

THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Language

THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL.

END OF THE TERMS AND CONDITIONS

How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software

To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps:

Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner;
Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package;
Attach the statement to the appropriate annotated syntax at the beginning of each source file.
Copyright (c) [Year] [name of copyright holder]
[Software Name] is licensed under Mulan PSL v2.
You can use this software according to the terms and conditions of the Mulan PSL v2.
You may obtain a copy of Mulan PSL v2 at:
http://license.coscl.org.cn/MulanPSL2
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
See the Mulan PSL v2 for more details.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [2021] [Peking University]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+167 -10 README.md

@@ -1,20 +1,177 @@
<div align=center>
<img src="./img/hetu.png" width="300" />
</div>


# HETU

<!--- [![license](https://img.shields.io/github/license/apache/zookeeper?color=282661)](LICENSE) --->

[Documentation](https://hetu-doc.readthedocs.io) |
[Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html)

Hetu is a high-performance distributed deep learning system targeting the training of DL models with trillions of parameters, developed by <a href="http://net.pku.edu.cn/~cuibin/" target="_blank" rel="nofollow">DAIR Lab</a> at Peking University. It takes into account both high availability in industry and innovation in academia, and has a number of advanced characteristics:

- Applicability. DL model definition with a standard dataflow graph; many basic CPU and GPU operators; efficient implementations of a wide range of DL models and at least 10 popular ML algorithms.

- Efficiency. Achieve at least 30% speedup compared to TensorFlow on DNN, CNN, RNN benchmarks.

- Flexibility. Supporting various parallel training protocols and distributed communication architectures, such as Data/Model/Pipeline parallel; Parameter server & AllReduce.

- Scalability. Deployment on more than 100 computation nodes; training giant models with trillions of parameters, e.g., on Criteo Kaggle and Open Graph Benchmark.

- Agility. Automated ML pipeline: feature engineering, model selection, hyperparameter search.

We welcome everyone interested in machine learning or graph computing to contribute code, create issues, or open pull requests. Please refer to the [Contribution Guide](CONTRIBUTING.md) for more details.

## Installation
1. Clone the repository.

2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used: `conda env create -f environment.yml`.
Please prepare the CUDA toolkit and cuDNN in advance.

3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
```bash
# modify paths and configurations in cmake/config.cmake

# generate Makefile
mkdir build && cd build && cmake ..


# compile
# make all
make -j 8
# make hetu, version is specified in cmake/config.cmake
make hetu -j 8
# make allreduce module
make allreduce -j 8
# make ps module
make ps -j 8
# make geometric module
make geometric -j 8
# make hetu-cache module
make hetu_cache -j 8
```



4. Prepare the environment for running. Edit the `hetu.exp` file and set the environment path for Python and the path to the `mpirun` executable if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp`.



## Usage

Train logistic regression on gpu:


```bash
bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST
```

Train a 3-layer mlp on gpu:


```bash
bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10
```

Train a 3-layer cnn with gpu:

```bash
bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST
```


Train a 3-layer mlp with allreduce on 8 gpus (use mpirun):
```bash
bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10
```

Train a 3-layer mlp with PS on 1 server and 2 workers:
```bash
# in the script we launch the scheduler and server, and two workers
bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10
```
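These scripts all drive the same Python entry point. As a rough sketch of what they set up, modeled on the `examples/cnn/main.py` trainer included in this commit (the `models.mlp` choice, batch size, and learning rate below are illustrative, not prescribed):

```python
import hetu as ht
import models  # the model zoo under examples/cnn/models in this commit

# Illustrative data: CIFAR10, flattened for the MLP model.
train_x, train_y, valid_x, valid_y = ht.data.normalize_cifar(num_class=10)
train_x = train_x.reshape(train_x.shape[0], -1)
valid_x = valid_x.reshape(valid_x.shape[0], -1)

# Dataloader ops feed minibatches for the 'train' and 'validate' phases.
x = ht.dataloader_op([ht.Dataloader(train_x, 128, 'train'),
                      ht.Dataloader(valid_x, 128, 'validate')])
y_ = ht.dataloader_op([ht.Dataloader(train_y, 128, 'train'),
                       ht.Dataloader(valid_y, 128, 'validate')])

loss, y = models.mlp(x, y_)                       # build the dataflow graph
train_op = ht.optim.SGDOptimizer(learning_rate=0.1).minimize(loss)

# One training pass over the dataset on GPU 0.
executor = ht.Executor({'train': [loss, y, y_, train_op],
                        'validate': [loss, y, y_]}, ctx=ht.gpu(0))
for _ in range(executor.get_batch_num('train')):
    loss_val, _, _, _ = executor.run('train',
                                     eval_node_list=[loss, y, y_, train_op])
    print(loss_val.asnumpy())
```

The distributed scripts keep this graph definition unchanged and only vary how it is executed; the trainer's `--comm-mode` flag selects PS, AllReduce, or Hybrid communication.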


## More Examples
Please refer to the examples directory, which contains CNN, NLP, CTR, and GNN training scripts. For distributed training, please refer to the CTR and GNN tasks.

## Community
* Email: xupeng.miao@pku.edu.cn
* Slack: coming soon
* Hetu homepage: https://hetu-doc.readthedocs.io
* [Committers & Contributors](COMMITTERS.md)
* [Contributing to Hetu](CONTRIBUTING.md)
* [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html)

## Enterprise Users

If you are an enterprise user and find Hetu useful in your work, please let us know, and we will be glad to add your company logo here.

<img src="./img/tencent.png" width = "200"/>
<img src="./img/alibabacloud.png" width = "200"/>
<img src="./img/kuaishou.png" width = "200"/>

## License

The entire codebase is under [license](LICENSE)

## Papers
1. Xupeng Miao, Linxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs.](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021
2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773) SIGMOD 2021
3. coming soon

## Acknowledgements

We learned and borrowed insights from a few open source projects including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel).

## Appendix
The prerequisites for the different modules in Hetu are listed as follows:
```
"*" means you should prepare by yourself, while others support auto-download
Hetu: OpenMP(*), CMake(*)
Hetu (version mkl): MKL 1.6.1
Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*)
Hetu (version all): both

Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs GPU version

Hetu-PS: Protobuf(*), ZeroMQ 4.3.2

Hetu-Geometric: Pybind11(*), Metis(*)

Hetu-Cache: Pybind11(*), this module needs PS module

##################################################################
Tips for preparing the prerequisites
Preparing CUDA, CUDNN, NCCL (NCCL is already in the conda environment):
1. download from https://developer.nvidia.com
2. install
3. modify paths in cmake/config.cmake if necessary
Preparing OpenMP:
You just need to ensure your compiler supports OpenMP.

Preparing CMake, Protobuf, Pybind11, Metis:
Install by anaconda:
conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis

Preparing OpenMPI (not necessary):
install by anaconda: `conda install -c conda-forge openmpi=4.0.3`
or
1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
2. build openmpi by `./configure /path/to/build && make -j8 && make install`
3. modify MPI_HOME to /path/to/build in cmake/config.cmake

Preparing MKL (not necessary):
install by anaconda: `conda install -c conda-forge onednn`
or
1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz
2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake

Preparing ZeroMQ (not necessary):
install by anaconda: `conda install -c anaconda zeromq=4.3.2`
or
1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip
2. build zeromq by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8`
3. modify ZMQ_ROOT to /path/to/build in cmake/config.cmake
```

+2 -0 bin/heturun

@@ -0,0 +1,2 @@
#!/bin/bash
python $(cd $(dirname $0); pwd)/../python/runner.py $@

+75 -0 cmake/Modules/FindCUDNN.cmake

@@ -0,0 +1,75 @@
# Find the CUDNN libraries
#
# The following variables are optionally searched for defaults
# CUDNN_ROOT: Base directory where CUDNN is found
# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for
# CUDNN_LIBRARY: Directory where CUDNN library is searched for
# CUDNN_STATIC: Are we looking for a static library? (default: no)
#
# The following are set after configuration is done:
# CUDNN_FOUND
# CUDNN_INCLUDE_PATH
# CUDNN_LIBRARY_PATH
#

set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN")
if (DEFINED ENV{CUDNN_ROOT_DIR})
message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.")
endif()
list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})

# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT})

set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files")

find_path(CUDNN_INCLUDE_PATH cudnn.h
HINTS ${CUDNN_INCLUDE_DIR}
PATH_SUFFIXES cuda/include cuda include
REQUIRED)

option(CUDNN_STATIC "Look for static CUDNN" OFF)
if (CUDNN_STATIC)
set(CUDNN_LIBNAME "libcudnn_static.a")
else()
set(CUDNN_LIBNAME "cudnn")
endif()

set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)")
if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC)
message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.")
endif()

find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
PATHS ${CUDNN_LIBRARY}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64
REQUIRED)

set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${CUDNN_INCLUDE_PATH}/cudnn.h\"
int main()
{
std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' << CUDNN_PATCHLEVEL;
int x = cudnnGetVersion();
return x == CUDNN_VERSION;
}
")
try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE CUDNN_VERSION
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
LINK_LIBRARIES ${CUDNN_LIBRARY_PATH})
if (NOT CUDNN_VERSION_MATCHED)
message(FATAL_ERROR "Found CUDNN header version and library version do not match! \
(include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.")
endif()
message(STATUS "CUDNN version: ${CUDNN_VERSION}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
CUDNN
REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH
VERSION_VAR CUDNN_VERSION)

mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)

+70 -0 cmake/Modules/FindMETIS.cmake

@@ -0,0 +1,70 @@
# Accepts the following variables:
#
# METIS_ROOT: Prefix where METIS is installed.
# METIS_LIB_NAME: Name of the METIS library (default: metis).
# METIS_LIBRARY: Full path of the METIS library.

# Sets the following variables:
#
# METIS_LIBRARY: Full path of the METIS library.
# METIS_FOUND: True if METIS was found.
# METIS_LIBRARIES: List of all libraries needed for linking with METIS,
#
# Provides the following macros:
#
# find_package(METIS)
#
# Searches for METIS (See above)


# search metis header
find_path(METIS_INCLUDE_DIR metis.h
PATHS ${METIS_DIR} ${METIS_ROOT}
PATH_SUFFIXES metis include include/metis Lib METISLib
NO_DEFAULT_PATH
DOC "Include directory of metis")
find_path(METIS_INCLUDE_DIR metis.h
PATH_SUFFIXES metis include include/metis Lib METISLib)

set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library")

# search metis library
if(NOT METIS_LIB_NAME)
set(METIS_LIB_NAME metis)
endif(NOT METIS_LIB_NAME)

find_library(METIS_LIBRARY ${METIS_LIB_NAME}
PATHS ${METIS_DIR} ${METIS_ROOT}
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
find_library(METIS_LIBRARY ${METIS_LIB_NAME}
PATH_SUFFIXES lib
)

# behave like a CMake module is supposed to behave
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
"METIS"
DEFAULT_MSG
METIS_INCLUDE_DIR
METIS_LIBRARY
)

mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME)

# if both headers and library are found, store results
if(METIS_FOUND)
set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR})
set(METIS_LIBRARIES ${METIS_LIBRARY})
# log result
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Determing location of METIS succeded:\n"
"Include directory: ${METIS_INCLUDE_DIRS}\n"
"Library directory: ${METIS_LIBRARIES}\n\n")
else(METIS_FOUND)
# log erroneous result
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
"Determining location of METIS failed:\n"
"Include directory: ${METIS_INCLUDE_DIRS}\n"
"Library directory: ${METIS_LIBRARIES}\n\n")
endif(METIS_FOUND)

+14 -0 cmake/Modules/FindMKL.cmake

@@ -0,0 +1,14 @@
# - Try to find DNNL(MKL-DNN)
# Once done this will define
# DNNL_FOUND - System has DNNL
# DNNL_INCLUDE_DIR - The DNNL include directories
# DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build
# DNNL_LIBRARY - The libraries needed to use DNNL
# DNNL_DEFINITIONS - Compiler switches required for using DNNL

find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include )
find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include )
find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src )

include ( FindPackageHandleStandardArgs )
find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR )

+97 -0 cmake/Modules/FindNCCL.cmake

@@ -0,0 +1,97 @@
# Try to find NCCL
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

if (NOT DEFINED NCCL_ROOT)
set(NCCL_ROOT $ENV{CONDA_PREFIX})
endif()

set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL")

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS
${NCCL_ROOT}
${NCCL_ROOT}/include
${NCCL_INCLUDE_DIR}
${NCCL_ROOT_DIR}
${NCCL_ROOT_DIR}/include
${CUDA_TOOLKIT_ROOT_DIR}/include
REQUIRED)

if ($ENV{USE_STATIC_NCCL})
message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
set(NCCL_LIBNAME "libnccl_static.a")
else()
set(NCCL_LIBNAME "nccl")
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS
${NCCL_LIB_DIR}
${NCCL_ROOT}
${NCCL_ROOT}/lib
${NCCL_ROOT}/lib/x86_64-linux-gnu
${NCCL_ROOT}/lib64
${NCCL_ROOT_DIR}
${NCCL_ROOT_DIR}/lib
${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
${NCCL_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib64
REQUIRED)

set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${NCCL_HEADER_FILE}\"
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION}")
else()
message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
NCCL
REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES
VERSION_VAR NCCL_VERSION)

+47 -0 cmake/Modules/FindZMQ.cmake

@@ -0,0 +1,47 @@
# - Try to find ZMQ
# Once done this will define
# ZMQ_FOUND - System has ZMQ
# ZMQ_INCLUDE_DIRS - The ZMQ include directories
# ZMQ_LIBRARIES - The libraries needed to use ZMQ
# ZMQ_DEFINITIONS - Compiler switches required for using ZMQ

find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include )
find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib )

set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} )
set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} )

if (DEFINED ZMQ_LIBRARIES AND DEFINED ZMQ_INCLUDE_DIRS)
set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc")
file(WRITE ${file} "
#include <iostream>
#include \"${ZMQ_INCLUDE_DIRS}/zmq.h\"
int main()
{
std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH;
int x, y, z;
zmq_version(&x, &y, &z);
return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH;
}
")
try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE ZMQ_VERSION
LINK_LIBRARIES ${ZMQ_LIBRARIES})
if (NOT ZMQ_VERSION_MATCHED)
message(WARNING "Found ZMQ header version and library version do not match! \
(include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). Please set ZMQ_ROOT and ZMQ_BUILD carefully.")
unset(ZMQ_INCLUDE_DIRS)
unset(ZMQ_LIBRARIES)
unset(ZMQ_VERSION)
else ()
message(STATUS "ZMQ version: ${ZMQ_VERSION}")
endif()
endif()

include ( FindPackageHandleStandardArgs )
# handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args (
ZMQ
REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS
VERSION_VAR ZMQ_VERSION)

+55 -0 cmake/config.example.cmake

@@ -0,0 +1,55 @@
######################
### Set targets ######
######################

# hetu main version, choose from (mkl, gpu, all)
# if using mkl (for CPU) or all, OpenMP(*), mkl required
# if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required
set(HETU_VERSION "all")

# whether to compile allreduce module
# nccl(*), openmpi required
set(HETU_ALLREDUCE ON)

# whether to compile ps module
# protobuf(*), zeromq required
set(HETU_PS ON)

# whether to compile geometric module (for GNNs)
# pybind11(*), metis(*) required
set(HETU_GEOMETRIC ON)

# whether to compile cache module (for PS)
# to enable this, you must turn HETU_PS on
# pybind11(*) required
set(HETU_CACHE ON)

# whether to compile Hetu ML Module
set(HETU_ML ON)
set(HETU_PARALLEL_ML ON)

######################
### Set paths ########
######################

# CUDA version >= 10.1
set(CUDAToolkit_ROOT /usr/local/cuda)

# NCCL version >= 2.8
set(NCCL_ROOT $ENV{CONDA_PREFIX})

set(CUDNN_ROOT)

# MPI version >= 3.1 (OpenMPI version >= 4.0.3)
# if a valid version is not found, we'll download and compile it on the fly (openmpi-4.0.3)
set(MPI_HOME $ENV{CONDA_PREFIX})

# MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl
# if not found, we'll download and compile it on the fly
set(MKL_ROOT $ENV{CONDA_PREFIX})
set(MKL_BUILD $ENV{CONDA_PREFIX})

# ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq
# if not found, we'll download and compile it on the fly
set(ZMQ_ROOT $ENV{CONDA_PREFIX})
set(ZMQ_BUILD $ENV{CONDA_PREFIX})

+84 -0 environment.yml

@@ -0,0 +1,84 @@
name: hetu
channels:
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- bcrypt=3.2.0=py37h5e8e339_1
- blas=1.0=mkl
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2021.7.5=h06a4308_1
- certifi=2021.5.30=py37h06a4308_0
- cffi=1.14.6=py37hc58025e_0
- cmake=3.18.2=ha30ef3c_0
- cryptography=3.4.7=py37h5d9358c_0
- cudatoolkit=10.1.243=h6bb024c_0
- expat=2.4.1=h2531618_2
- intel-openmp=2021.3.0=h06a4308_3350
- joblib=1.0.1=pyhd3eb1b0_0
- krb5=1.18.2=h173b8e3_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libcurl=7.71.1=h20c2e04_1
- libedit=3.1.20210216=h27cfd23_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgfortran-ng=7.5.0=h14aa051_19
- libgfortran4=7.5.0=h14aa051_19
- libgomp=9.3.0=h5101ec6_17
- libprotobuf=3.15.8=h780b84a_0
- libsodium=1.0.18=h7b6447c_0
- libssh2=1.9.0=h1ba5d50_1
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libuv=1.40.0=h7b6447c_0
- lz4-c=1.9.3=h2531618_0
- metis=5.1.0=hf484d3e_4
- mkl=2021.3.0=h06a4308_520
- mkl-service=2.4.0=py37h7f8727e_0
- mkl_fft=1.3.0=py37h42c9631_2
- mkl_random=1.2.2=py37h51133e4_0
- mpi=1.0=openmpi
- nccl=2.8.3.1=hcaf9a05_0
- ncurses=6.2=he6710b0_1
- numpy=1.20.3=py37hf144106_0
- numpy-base=1.20.3=py37h74d4b33_0
- onednn=2.3=omp_hf4ef041_0
- onnx=1.9.0=py37h284874a_0
- onnxruntime=1.7.2=py37he8cb6d3_1
- openmpi=4.0.3=hdf1f1ad_1
- openssl=1.1.1k=h27cfd23_0
- pandas=1.2.5=py37h295c915_0
- paramiko=2.7.2=pyh9f0ad1d_0
- pip=21.1.3=py37h06a4308_0
- protobuf=3.15.8=py37hcd2ae1e_0
- psutil=5.8.0=py37h5e8e339_1
- pybind11=2.6.2=py37hff7bd54_1
- pycparser=2.20=pyh9f0ad1d_2
- pynacl=1.4.0=py37h5e8e339_2
- python=3.7.10=h12debd9_4
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python_abi=3.7=2_cp37m
- pytz=2021.1=pyhd3eb1b0_0
- pyyaml=5.4.1=py37h27cfd23_1
- re2=2021.04.01=h9c3ff4c_0
- readline=8.1=h27cfd23_0
- rhash=1.4.1=h3c74f83_1
- scikit-learn=0.24.2=py37ha9443f7_0
- scipy=1.6.2=py37had2a1c9_1
- setuptools=52.0.0=py37h06a4308_0
- six=1.16.0=pyhd3eb1b0_0
- sqlite=3.36.0=hc218d9a_0
- threadpoolctl=2.2.0=pyhb85f177_0
- tk=8.6.10=hbc83047_0
- tqdm=4.61.2=pyhd3eb1b0_1
- typing-extensions=3.10.0.0=hd8ed1ab_0
- typing_extensions=3.10.0.0=pyha770c72_0
- wheel=0.36.2=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- yaml=0.2.5=h7b6447c_0
- zeromq=4.3.2=he6710b0_3
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.9=haebb681_0
- pip:
  - cloudpickle==1.6.0
  - wget==3.2

+49 -0 examples/cnn/README.md

@@ -0,0 +1,49 @@
# CNN Examples
In this directory we provide simple implementations for CNN models, including both hetu and tensorflow versions for comparison.
## Structure
```
- cnn
    - models/              CNN models in HETU
    - pytorch_models/      CNN models in PyTorch
    - tf_models/           CNN models in TensorFlow
    - scripts/             Test scripts
    - main.py              Trainer for HETU
    - run_tf_horovod.py    Trainer for Horovod
    - tf_launch_server.py  Trainer for TF-PS (role: server)
    - tf_launch_worker.py  Trainer for TF-PS (role: worker)
    - tf_main.py           Trainer for TensorFlow
    - torch_main.py        Trainer for PyTorch
```
## Usage
Here are some examples of running scripts.
```bash
bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu
bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node)
bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 16-GPU (2 nodes)
```
To train in the PS setting, we also need to launch the scheduler and server first. For more information about distributed training, please refer to the CTR or GNN examples.

We can change the settings in the scripts; see the example script below.
```bash
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py

### validate and timing
python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing

### run in cpu
# python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing

```

For more details about training setting, please refer to `main.py`.
## Models
We provide the following models with their supported datasets.
```
CIFAR100: VGG, ResNet
CIFAR10: MLP, VGG, ResNet
MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN
```
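All of these model builders share one calling convention: a function takes the dataloader ops `x` and `y_` (plus an optional class count) and returns `(loss, y)`, which is what lets `main.py` pick a model with `eval('models.' + args.model)`. Below is a minimal sketch of a new model in that style; the layer ops mirror those used in `models/AlexNet.py`, while the loss ops (`ht.softmaxcrossentropy_op`, `ht.reduce_mean_op`) are assumed names and should be checked against the ops actually exported by your Hetu build.

```python
import hetu as ht
from hetu import init


def tiny_mlp(x, y_, num_class=10):
    # Hidden layer: flattened CIFAR10 image (3*32*32 = 3072) -> 256, ReLU.
    w1 = init.random_normal(shape=(3072, 256), stddev=0.1, name='tiny_mlp_w1')
    b1 = init.random_normal(shape=(256,), stddev=0.1, name='tiny_mlp_b1')
    h = ht.matmul_op(x, w1)
    h = ht.relu_op(h + ht.broadcastto_op(b1, h))

    # Output layer: 256 -> num_class logits.
    w2 = init.random_normal(shape=(256, num_class), stddev=0.1, name='tiny_mlp_w2')
    b2 = init.random_normal(shape=(num_class,), stddev=0.1, name='tiny_mlp_b2')
    y = ht.matmul_op(h, w2)
    y = y + ht.broadcastto_op(b2, y)

    # Assumed loss ops -- verify the exact names in the installed hetu package.
    loss = ht.softmaxcrossentropy_op(y, y_)
    loss = ht.reduce_mean_op(loss, [0])
    return loss, y
```

Exporting such a function from `models/__init__.py` would then make it selectable through the trainer's `--model` flag.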

+10 -0 examples/cnn/local_s1.yml

@@ -0,0 +1,10 @@
shared :
  DMLC_PS_ROOT_URI : 127.0.0.1
  DMLC_PS_ROOT_PORT : 13030
  DMLC_NUM_WORKER : 2
  DMLC_NUM_SERVER : 1
  DMLC_PS_VAN_TYPE : p3
launch :
  worker : 0
  server : 1
  scheduler : true
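The `shared` keys are standard ps-lite environment variables (root URI/port, worker and server counts), and `launch` says which roles this host starts: one scheduler, one server, and no local workers. The actual launcher lives in `python/runner.py` (invoked by `bin/heturun`) and is not shown in this diff; the sketch below only illustrates how such a config could map onto processes, and the entry-point command is a placeholder.

```python
import os
import subprocess
import yaml  # pyyaml is pinned in environment.yml

CONF_PATH = 'examples/cnn/local_s1.yml'              # this file
ENTRY = ['python', 'main.py', '--comm-mode', 'PS']   # placeholder command

with open(CONF_PATH) as f:
    conf = yaml.safe_load(f)


def spawn(role):
    # Every ps-lite process reads the cluster layout from DMLC_* variables;
    # DMLC_ROLE selects scheduler / server / worker behaviour.
    env = dict(os.environ, DMLC_ROLE=role,
               **{k: str(v) for k, v in conf['shared'].items()})
    return subprocess.Popen(ENTRY, env=env)


procs = [spawn('scheduler')] if conf['launch']['scheduler'] else []
procs += [spawn('server') for _ in range(conf['launch']['server'])]
procs += [spawn('worker') for _ in range(conf['launch']['worker'])]
for p in procs:
    p.wait()
```

In the repository, `bin/heturun` forwards to `python/runner.py`, which performs the real launch.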

+202 -0 examples/cnn/main.py

@@ -0,0 +1,202 @@
import hetu as ht
import models
import os
import numpy as np
import argparse
import json
import logging
from time import time
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if device_id == 0:
logger.info(msg)


if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=10, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument('--comm-mode', default=None, help='communication mode')
args = parser.parse_args()

global device_id
device_id = 0
print_rank0("Training {} on HETU".format(args.model))
if args.comm_mode in ('AllReduce', 'Hybrid'):
comm, device_id = ht.mpi_nccl_init()
executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0)
else:
if args.gpu == -1:
executor_ctx = ht.cpu(0)
print_rank0('Use CPU.')
else:
executor_ctx = ht.gpu(args.gpu)
print_rank0('Use GPU %d.' % args.gpu)
if args.comm_mode in ('PS', 'Hybrid'):
settings_file = open(os.path.join(os.path.abspath(
os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu))
settings = json.load(settings_file)
for key in settings:
if type(settings[key]) == str:
os.environ[key] = settings[key]
else:
os.environ[key] = str(settings[key]) # type is str

assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \
'Model not supported!'
model = eval('models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset
assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'

if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = ht.optim.MomentumOptimizer(
learning_rate=args.learning_rate, nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = ht.optim.AdaGradOptimizer(
learning_rate=args.learning_rate, initial_accumulator_value=0.1)
else:
print_rank0('Use Adam Optimizer.')
opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate)

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
# train_set_x: (50000, 784), train_set_y: (50000, 10)
# valid_set_x: (10000, 784), valid_set_y: (10000, 10)
# x_shape = (args.batch_size, 784)
# y_shape = (args.batch_size, 10)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=10)
if args.model == "mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10)
# valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10)
# x_shape = (args.batch_size, 3, 32, 32)
# y_shape = (args.batch_size, 10)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=100)
# train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100)
# valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100)
else:
raise NotImplementedError

# model definition
print_rank0('Building model {}'.format(args.model))
x = ht.dataloader_op([
ht.Dataloader(train_set_x, args.batch_size, 'train'),
ht.Dataloader(valid_set_x, args.batch_size, 'validate'),
])
y_ = ht.dataloader_op([
ht.Dataloader(train_set_y, args.batch_size, 'train'),
ht.Dataloader(valid_set_y, args.batch_size, 'validate'),
])
if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
loss, y = model(x, y_, 100)
else:
loss, y = model(x, y_)

train_op = opt.minimize(loss)

eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]}
executor = ht.Executor(eval_nodes, ctx=executor_ctx,
comm_mode=args.comm_mode)
n_train_batches = executor.get_batch_num('train')
n_valid_batches = executor.get_batch_num('validate')

# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):  # one extra epoch: epoch 0 serves as warm-up and is excluded from the timing total
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
loss_val, predict_y, y_val, _ = executor.run(
'train', eval_node_list=[loss, y, y_, train_op])
# Loss for this minibatch
predict_y = predict_y.asnumpy()
y_val = y_val.asnumpy()
loss_all += loss_val.asnumpy()
batch_num += 1
# Predict accuracy for this minibatch
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)

loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
during_time = end - start
print_rank0("Running time of current epoch = %fs" % (during_time))
if i != 0:
running_time += during_time
if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
loss_val, valid_y_predicted, y_val = executor.run(
'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True)
val_loss_all += loss_val
batch_num += 1
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)

val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))
if args.comm_mode in ('AllReduce', 'Hybrid'):
ht.mpi_nccl_finish(comm)

+ 61
- 0
examples/cnn/models/AlexNet.py View File

@@ -0,0 +1,61 @@
import hetu as ht
from hetu import init


def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False):
weight = init.random_normal(
shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
bn_scale = init.random_normal(
shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale')
bn_bias = init.random_normal(
shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias')
x = ht.conv2d_op(x, weight, stride=1, padding=1)
x = ht.batch_normalization_op(x, bn_scale, bn_bias)
if with_relu:
x = ht.relu_op(x)
if with_pool:
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0)
return x


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def alexnet(x, y_):
'''
AlexNet model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building AlexNet model...')
x = ht.array_reshape_op(x, [-1, 1, 28, 28])
x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1',
with_relu=True, with_pool=True)
x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2',
with_relu=True, with_pool=True)
x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3',
with_relu=True, with_pool=False)
x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4',
with_relu=True, with_pool=False)
x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5',
with_relu=False, with_pool=True)
x = ht.array_reshape_op(x, (-1, 256*3*3))
x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True)
x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 41
- 0
examples/cnn/models/CNN.py View File

@@ -0,0 +1,41 @@
import hetu as ht
from hetu import init


def conv_relu_avg(x, shape):
weight = init.random_normal(shape=shape, stddev=0.1)
x = ht.conv2d_op(x, weight, padding=2, stride=1)
x = ht.relu_op(x)
x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def fc(x, shape):
weight = init.random_normal(shape=shape, stddev=0.1)
bias = init.random_normal(shape=shape[-1:], stddev=0.1)
x = ht.array_reshape_op(x, (-1, shape[0]))
x = ht.matmul_op(x, weight)
y = x + ht.broadcastto_op(bias, x)
return y


def cnn_3_layers(x, y_):
'''
3-layer-CNN model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building 3-layer-CNN model...')
x = ht.array_reshape_op(x, [-1, 1, 28, 28])
x = conv_relu_avg(x, (32, 1, 5, 5))
x = conv_relu_avg(x, (64, 32, 5, 5))
y = fc(x, (7 * 7 * 64, 10))
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 90
- 0
examples/cnn/models/LSTM.py View File

@@ -0,0 +1,90 @@
import hetu as ht
from hetu import init
import numpy as np


def lstm(x, y_):
'''
LSTM model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

forget_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w")
forget_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u")
forget_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b")
input_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w")
input_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u")
input_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b")
output_gate_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w")
output_gate_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u")
output_gate_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b")
tanh_w = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w")
tanh_u = init.random_normal(
shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u")
tanh_b = init.random_normal(
shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b")
out_weights = init.random_normal(
shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight")
out_bias = init.random_normal(
shape=(dimoutput,), stddev=0.1, name="lstm_out_bias")
initial_state = ht.Variable(value=np.zeros((1,)).astype(
np.float32), name='initial_state', trainable=False)
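# the scalar zero above is broadcast to (batch, dimhidden) at the first step below,
# serving as the initial cell and hidden states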

for i in range(nsteps):
cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput))
# forget gate
if i == 0:
temp = ht.matmul_op(cur_x, forget_gate_w)
last_c_state = ht.broadcastto_op(initial_state, temp)
last_h_state = ht.broadcastto_op(initial_state, temp)
cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp
else:
cur_forget = ht.matmul_op(
last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w)
cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget)
cur_forget = ht.sigmoid_op(cur_forget)
# input gate
cur_input = ht.matmul_op(
last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w)
cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input)
cur_input = ht.sigmoid_op(cur_input)
# output gate
cur_output = ht.matmul_op(
last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w)
cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output)
cur_output = ht.sigmoid_op(cur_output)
# tanh
cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \
ht.matmul_op(cur_x, tanh_w)
cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh)
cur_tanh = ht.tanh_op(cur_tanh)

last_c_state = ht.mul_op(last_c_state, cur_forget) + \
ht.mul_op(cur_input, cur_tanh)
last_h_state = ht.tanh_op(last_c_state) * cur_output

x = ht.matmul_op(last_h_state, out_weights)
y = x + ht.broadcastto_op(out_bias, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 46
- 0
examples/cnn/models/LeNet.py View File

@@ -0,0 +1,46 @@
import hetu as ht
from hetu import init


def conv_pool(x, in_channel, out_channel, name):
weight = init.random_normal(
shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight')
x = ht.conv2d_op(x, weight, padding=2, stride=1)
x = ht.relu_op(x)
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def lenet(x, y_):
'''
LeNet model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print('Building LeNet model...')
x = ht.array_reshape_op(x, (-1, 1, 28, 28))
x = conv_pool(x, 1, 6, name='lenet_conv1')
x = conv_pool(x, 6, 16, name='lenet_conv2')
x = ht.array_reshape_op(x, (-1, 7*7*16))
x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True)
x = fc(x, (120, 84), name='lenet_fc2', with_relu=True)
y = fc(x, (84, 10), name='lenet_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 24
- 0
examples/cnn/models/LogReg.py View File

@@ -0,0 +1,24 @@
import hetu as ht
from hetu import init


def logreg(x, y_):
'''
Logistic Regression model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building logistic regression model...")
weight = init.zeros((784, 10), name='logreg_weight')
bias = init.zeros((10,), name='logreg_bias')
x = ht.matmul_op(x, weight)
y = x + ht.broadcastto_op(bias, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 33
- 0
examples/cnn/models/MLP.py View File

@@ -0,0 +1,33 @@
import hetu as ht
from hetu import init


def fc(x, shape, name, with_relu=True):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
if with_relu:
x = ht.relu_op(x)
return x


def mlp(x, y_):
'''
MLP model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building MLP model...")
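# the first fully connected layer assumes flattened CIFAR-10 inputs (3 * 32 * 32 = 3072 features)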
x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True)
x = fc(x, (256, 256), 'mlp_fc2', with_relu=True)
y = fc(x, (256, 10), 'mlp_fc3', with_relu=False)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 56
- 0
examples/cnn/models/RNN.py View File

@@ -0,0 +1,56 @@
import hetu as ht
from hetu import init
import numpy as np


def rnn(x, y_):
'''
RNN model, for MNIST dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

print("Building RNN model...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

weight1 = init.random_normal(
shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1')
bias1 = init.random_normal(
shape=(dimhidden, ), stddev=0.1, name='rnn_bias1')
weight2 = init.random_normal(
shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2')
bias2 = init.random_normal(
shape=(dimhidden, ), stddev=0.1, name='rnn_bias2')
weight3 = init.random_normal(
shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3')
bias3 = init.random_normal(
shape=(dimoutput, ), stddev=0.1, name='rnn_bias3')
last_state = ht.Variable(value=np.zeros((1,)).astype(
np.float32), name='initial_state', trainable=False)

for i in range(nsteps):
cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput))
h = ht.matmul_op(cur_x, weight1)
h = h + ht.broadcastto_op(bias1, h)

if i == 0:
last_state = ht.broadcastto_op(last_state, h)
s = ht.concat_op(h, last_state, axis=1)
s = ht.matmul_op(s, weight2)
s = s + ht.broadcastto_op(bias2, s)
last_state = ht.relu_op(s)

final_state = last_state
x = ht.matmul_op(final_state, weight3)
y = x + ht.broadcastto_op(bias3, x)
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
return loss, y

+ 125
- 0
examples/cnn/models/ResNet.py View File

@@ -0,0 +1,125 @@
import hetu as ht
from hetu import init


def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''):
weight = init.random_normal(
shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight')
x = ht.conv2d_op(x, weight, stride=stride, padding=padding)
return x


def batch_norm_with_relu(x, hidden, name):
scale = init.random_normal(
shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale')
bias = init.random_normal(shape=(1, hidden, 1, 1),
stddev=0.1, name=name+'_bias')
x = ht.batch_normalization_op(x, scale, bias)
x = ht.relu_op(x)
return x


def resnet_block(x, in_channel, num_blocks, is_first=False, name=''):
if is_first:
out_channel = in_channel
identity = x
x = conv2d(x, in_channel, out_channel, stride=1,
padding=1, name=name+'_conv1')
x = batch_norm_with_relu(x, out_channel, name+'_bn1')
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv2')
x = x + identity
else:
out_channel = 2 * in_channel
identity = x
x = batch_norm_with_relu(x, in_channel, name+'_bn0')
x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]])
x = conv2d(x, in_channel, out_channel, stride=2,
padding=0, name=name+'_conv1')
x = batch_norm_with_relu(x, out_channel, name+'_bn1')
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv2')
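# shortcut for the downsampling case: average-pool the identity to halve its spatial
# size, then zero-pad its channel dimension up to out_channel (a parameter-free,
# "option A"-style ResNet shortcut)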
identity = ht.avg_pool2d_op(
identity, kernel_H=2, kernel_W=2, padding=0, stride=2)
identity = ht.pad_op(
identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
x = x + identity

for i in range(1, num_blocks):
identity = x
x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i))
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv%d' % (2 * i + 1))
x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1))
x = conv2d(x, out_channel, out_channel, stride=1,
padding=1, name=name+'_conv%d' % (2 * i + 2))
x = x + identity

return x


def fc(x, shape, name):
weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
return x


def resnet(x, y_, num_layers=18, num_class=10):
'''
ResNet model, for CIFAR10 dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
num_layers: 18 or 34
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

base_size = 16

x = conv2d(x, 3, base_size, stride=1, padding=1,
name='resnet_initial_conv')
x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')

if num_layers == 18:
print("Building ResNet-18 model...")
x = resnet_block(x, base_size, num_blocks=2,
is_first=True, name='resnet_block1')
x = resnet_block(x, base_size, num_blocks=2,
is_first=False, name='resnet_block2')
x = resnet_block(x, 2 * base_size, num_blocks=2,
is_first=False, name='resnet_block3')
x = resnet_block(x, 4 * base_size, num_blocks=2,
is_first=False, name='resnet_block4')
elif num_layers == 34:
print("Building ResNet-34 model...")
x = resnet_block(x, base_size, num_blocks=3,
is_first=True, name='resnet_block1')
x = resnet_block(x, base_size, num_blocks=4,
is_first=False, name='resnet_block2')
x = resnet_block(x, 2 * base_size, num_blocks=6,
is_first=False, name='resnet_block3')
x = resnet_block(x, 4 * base_size, num_blocks=3,
is_first=False, name='resnet_block4')
else:
assert False, "Number of layers should be 18 or 34!"

x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
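# for 32x32 CIFAR inputs the feature map here is 8 * base_size channels of size 4x4,
# i.e. 128 * base_size values per sample, which the reshape below flattens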
x = ht.array_reshape_op(x, (-1, 128 * base_size))
y = fc(x, (128 * base_size, num_class), name='resnet_final_fc')
# here we don't use cudnn for softmax crossentropy to avoid overflows
loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False)
loss = ht.reduce_mean_op(loss, [0])
return loss, y


def resnet18(x, y_, num_class=10):
return resnet(x, y_, 18, num_class)


def resnet34(x, y_, num_class=10):
return resnet(x, y_, 34, num_class)

+ 100
- 0
examples/cnn/models/VGG.py View File

@@ -0,0 +1,100 @@
import hetu as ht
from hetu import init


def conv_bn_relu(x, in_channel, out_channel, name):
weight = init.random_normal(shape=(out_channel, in_channel, 3, 3),
stddev=0.1, name=name+'_weight')
bn_scale = init.random_normal(shape=(1, out_channel, 1, 1),
stddev=0.1, name=name+'_bn_scale')
bn_bias = init.random_normal(shape=(1, out_channel, 1, 1),
stddev=0.1, name=name+'_bn_bias')

x = ht.conv2d_op(x, weight, padding=1, stride=1)
x = ht.batch_normalization_op(x, bn_scale, bn_bias)
act = ht.relu_op(x)
return act


def vgg_2block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_3block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_4block(x, in_channel, out_channel, name):
x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3')
x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4')
x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2)
return x


def vgg_fc(x, in_feat, out_feat, name):
weight = init.random_normal(shape=(in_feat, out_feat),
stddev=0.1, name=name+'_weight')
bias = init.random_normal(shape=(out_feat,),
stddev=0.1, name=name+'_bias')
x = ht.matmul_op(x, weight)
x = x + ht.broadcastto_op(bias, x)
return x


def vgg(x, y_, num_layers, num_class=10):
'''
VGG model, for CIFAR10/CIFAR100 dataset.

Parameters:
x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
num_layers: 16 or 19
Return:
loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
'''

if num_layers == 16:
print('Building VGG-16 model...')
x = vgg_2block(x, 3, 64, 'vgg_block1')
x = vgg_2block(x, 64, 128, 'vgg_block2')
x = vgg_3block(x, 128, 256, 'vgg_block3')
x = vgg_3block(x, 256, 512, 'vgg_block4')
x = vgg_3block(x, 512, 512, 'vgg_block5')

elif num_layers == 19:
print('Building VGG-19 model...')
x = vgg_2block(x, 3, 64, 'vgg_block1')
x = vgg_2block(x, 64, 128, 'vgg_block2')
x = vgg_4block(x, 128, 256, 'vgg_block3')
x = vgg_4block(x, 256, 512, 'vgg_block4')
x = vgg_4block(x, 512, 512, 'vgg_block5')

else:
assert False, 'VGG model should have 16 or 19 layers!'

x = ht.array_reshape_op(x, (-1, 512))
x = vgg_fc(x, 512, 4096, 'vgg_fc1')
x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
y = vgg_fc(x, 4096, num_class, 'vgg_fc3')
loss = ht.softmaxcrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])

return loss, y


def vgg16(x, y_, num_class=10):
return vgg(x, y_, 16, num_class)


def vgg19(x, y_, num_class=10):
return vgg(x, y_, 19, num_class)

+ 9
- 0
examples/cnn/models/__init__.py View File

@@ -0,0 +1,9 @@
from .VGG import vgg, vgg16, vgg19
from .LogReg import logreg
from .CNN import cnn_3_layers
from .AlexNet import alexnet
from .LeNet import lenet
from .MLP import mlp
from .RNN import rnn
from .LSTM import lstm
from .ResNet import resnet, resnet18, resnet34

+ 4
- 0
examples/cnn/pytorch_models/__init__.py View File

@@ -0,0 +1,4 @@
from .mlp import mlp
from .resnet import resnet18, resnet34, resnet50
from .vgg import vgg16, vgg19
from .rnn import rnn

+ 20
- 0
examples/cnn/pytorch_models/mlp.py View File

@@ -0,0 +1,20 @@
import torch.nn.functional as F
import torch.nn as nn


class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.fc1 = nn.Linear(3072, 256)
self.fc2 = nn.Linear(256, 256)
self.fc3 = nn.Linear(256, 10)

def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
out = self.fc3(x)
return out


def mlp():
return MLP()

+ 116
- 0
examples/cnn/pytorch_models/resnet.py View File

@@ -0,0 +1,116 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(
in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, in_planes, planes, stride=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, self.expansion *
planes, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(self.expansion*planes)

self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion*planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion*planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion*planes)
)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = F.relu(self.bn2(self.conv2(out)))
out = self.bn3(self.conv3(out))
out += self.shortcut(x)
out = F.relu(out)
return out


class ResNet(nn.Module):
def __init__(self, block, num_blocks, num_classes=10):
super(ResNet, self).__init__()
self.in_planes = 64

self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.linear = nn.Linear(512*block.expansion, num_classes)

def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1]*(num_blocks-1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
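# with 32x32 CIFAR inputs the feature map is 4x4 here, so the 4x4 average pool below reduces it to 1x1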
out = F.avg_pool2d(out, 4)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out


def resnet18(num_classes=10):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)


def resnet34(num_classes=10):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)


def resnet50(num_classes=10):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)


def resnet101(num_classes=10):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)


def resnet152(num_classes=10):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)

+ 36
- 0
examples/cnn/pytorch_models/rnn.py View File

@@ -0,0 +1,36 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
def __init__(self, diminput, dimoutput, dimhidden, nsteps):
super(RNN, self).__init__()
self.diminput = diminput
self.dimoutput = dimoutput
self.dimhidden = dimhidden
self.nsteps = nsteps
self.fc1 = nn.Linear(diminput, dimhidden)
self.fc2 = nn.Linear(dimhidden*2, dimhidden)
self.fc3 = nn.Linear(dimhidden, dimoutput)

def forward(self, x):
last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device)
for i in range(self.nsteps):
t = i % self.nsteps
index = torch.Tensor([idx for idx in range(
t*self.diminput, (t+1)*self.diminput)]).long().to(x.device)
cur_x = torch.index_select(x, 1, index)
h = self.fc1(cur_x)
s = torch.cat([h, last_state], axis=1)
s = self.fc2(s)
last_state = F.relu(s)

final_state = last_state
y = self.fc3(final_state)
return y


def rnn(diminput, dimoutput, dimhidden, nsteps):

return RNN(diminput, dimoutput, dimhidden, nsteps)

+ 48
- 0
examples/cnn/pytorch_models/vgg.py View File

@@ -0,0 +1,48 @@
import torch
import torch.nn as nn


cfg = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
def __init__(self, vgg_name, num_class=10):
super(VGG, self).__init__()
self.features = self._make_layers(cfg[vgg_name])
self.fc1 = nn.Linear(512, 4096)
self.fc2 = nn.Linear(4096, 4096)
self.classifier = nn.Linear(4096, num_class)

def forward(self, x):
out = self.features(x)
out = out.view(out.size(0), -1)
out = self.fc2(self.fc1(out))
out = self.classifier(out)
return out

def _make_layers(self, cfg):
layers = []
in_channels = 3
for x in cfg:
if x == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
nn.BatchNorm2d(x),
nn.ReLU(inplace=True)]
in_channels = x
layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
return nn.Sequential(*layers)


def vgg16(num_class=10):
return VGG('VGG16', num_class)


def vgg19(num_class=10):
return VGG('VGG19', num_class)

+ 309
- 0
examples/cnn/run_tf_horovod.py View File

@@ -0,0 +1,309 @@
import os
import numpy as np
import tensorflow as tf
import tf_models
import time
import argparse
from tqdm import tqdm
from sklearn import metrics
import horovod.tensorflow as hvd
import hetu as ht
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if rank % 8 == 0:
logger.info(msg)


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
# horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8
# python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn


# if using a multi-node setup under conda, /etc/bash.bashrc may need to be modified
# we can also use mpirun (default gloo):
# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model
'''
def train(model, args):
hvd.init()

def get_current_shard(data):
part_size = data.shape[0] // hvd.size()
start = part_size * hvd.rank()
end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0]
return data[start:end]

batch_size = 128
if args.model == 'tf_resnet34':
train_images, train_labels, test_images,\
test_labels = ht.data.tf_normalize_cifar10()
x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])
else:
datasets = ht.data.mnist()
train_images, train_labels = datasets[0]
test_images, test_labels = datasets[2]
x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10])


n_train_batches = train_images.shape[0] // batch_size

loss, y = model(x, y_)
opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)

global_step = tf.train.get_or_create_global_step()
# in DistributedOptimizer, all tensors are reduced on the GPU by default
# pass device_sparse=... / device_dense=... to change the reduction device
# using device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step)

gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank()))
# Horovod broadcasts the initial variables on the GPU by default, which can cause OOM
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

iterations = train_images.shape[0] // batch_size
total_epoch = 10
start_index = 0
total_time = 0
for ep in range(total_epoch + 1):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc = [], []
for it in range(n_train_batches):
x_val = train_images[start_index: start_index + batch_size]
y_val = train_labels[start_index : start_index+batch_size]
start_index += batch_size
if start_index + batch_size > train_images.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val})
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
en_time = time.time()
train_time = en_time - st_time
if ep != 0:
total_time += train_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, train_time)

print("training time:", total_time)



def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True, help="model to be tested")
parser.add_argument("--all", action="store_true", help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
print('Model:', raw_model)
train(model, args)

if __name__ == '__main__':
main()
'''

if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / nesterov / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
args = parser.parse_args()

hvd.init()
global rank
rank = hvd.rank()
assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

if dataset == 'MNIST':
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 32, 32, 3), name='x')
y_ = tf.compat.v1.placeholder(
dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)

global_step = tf.train.get_or_create_global_step()
# in DistributedOptimizer, all tensors are reduced on the GPU by default
# pass device_sparse=... / device_dense=... to change the reduction device
# using device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(
opt).minimize(loss, global_step=global_step)
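# a minimal sketch of the device options mentioned above; the device_dense /
# device_sparse keyword names come from the Horovod TensorFlow API, and the
# values here are only illustrative:
# train_op = hvd.DistributedOptimizer(
#     opt, device_dense='/gpu:0', device_sparse='/cpu:0'
# ).minimize(loss, global_step=global_step)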

gpu_options = tf.compat.v1.GPUOptions(
allow_growth=True, visible_device_list=str(hvd.local_rank()))
# Horovod broadcasts the initial variables on the GPU by default, which can cause OOM
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(
hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

# sess.run(tf.compat.v1.global_variables_initializer())

# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time.time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time.time()
print_rank0("Running time of current epoch = %fs" % (end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 9
- 0
examples/cnn/scripts/hetu_16gpu.sh View File

@@ -0,0 +1,9 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce


+ 11
- 0
examples/cnn/scripts/hetu_1gpu.sh View File

@@ -0,0 +1,11 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py


# model:
# e.g. bash hetu_1gpu.sh mlp CIFAR10

### validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

+ 10
- 0
examples/cnn/scripts/hetu_2gpu_ps.sh View File

@@ -0,0 +1,10 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py

### validate and timing
python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched &
python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 &
python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 &
wait

+ 8
- 0
examples/cnn/scripts/hetu_8gpu.sh View File

@@ -0,0 +1,8 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..

### validate and timing
#
NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce

+ 11
- 0
examples/cnn/scripts/horovod_16gpu.sh View File

@@ -0,0 +1,11 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_tf_horovod.py

# horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing

horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model

+ 6
- 0
examples/cnn/scripts/horovod_8gpu.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash
workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_tf_horovod.py

horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing

+ 18
- 0
examples/cnn/scripts/pytorch_16gpu_0.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=162.105.146.117
MASTER_PORT=6000
NNODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 18
- 0
examples/cnn/scripts/pytorch_16gpu_1.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=162.105.146.117
MASTER_PORT=39575
NNODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 7
- 0
examples/cnn/scripts/pytorch_1gpu.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

## validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

+ 18
- 0
examples/cnn/scripts/pytorch_8gpu.sh View File

@@ -0,0 +1,18 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../torch_main.py

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
${mainpy} \
--model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed

+ 15
- 0
examples/cnn/scripts/tf_16gpu_worker0.sh View File

@@ -0,0 +1,15 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate &
wait


+ 14
- 0
examples/cnn/scripts/tf_16gpu_worker1.sh View File

@@ -0,0 +1,14 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate &
wait

+ 10
- 0
examples/cnn/scripts/tf_1gpu.sh View File

@@ -0,0 +1,10 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_main.py

### validate and timing
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing

### run in cpu
# python ${mainpy} --model tf_mlp --gpu -1 --validate --timing

+ 15
- 0
examples/cnn/scripts/tf_8gpu.sh View File

@@ -0,0 +1,15 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../tf_launch_worker.py

python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate &
python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate &
wait


+ 23
- 0
examples/cnn/settings/tf_dist_s1_w16.json View File

@@ -0,0 +1,23 @@
{
"worker": [
"162.105.146.117:34569",
"162.105.146.117:34568",
"162.105.146.117:34567",
"162.105.146.117:34566",
"162.105.146.117:34565",
"162.105.146.117:34564",
"162.105.146.117:34563",
"162.105.146.117:34562",
"162.105.146.118:34779",
"162.105.146.118:34778",
"162.105.146.118:34777",
"162.105.146.118:34776",
"162.105.146.118:34775",
"162.105.146.118:34774",
"162.105.146.118:34773",
"162.105.146.118:34772"
],
"ps": [
"162.105.146.117:34575"
]
}

+ 11
- 0
examples/cnn/settings/tf_dist_s1_w4.json View File

@@ -0,0 +1,11 @@
{
"worker": [
"162.105.146.119:34569",
"162.105.146.119:34568",
"162.105.146.119:34567",
"162.105.146.119:34566"
],
"ps": [
"162.105.146.119:34575"
]
}

+ 15
- 0
examples/cnn/settings/tf_dist_s1_w8.json View File

@@ -0,0 +1,15 @@
{
"worker": [
"162.105.146.119:34569",
"162.105.146.119:34568",
"162.105.146.119:34567",
"162.105.146.119:34566",
"162.105.146.119:34565",
"162.105.146.119:34564",
"162.105.146.119:34563",
"162.105.146.119:34562"
],
"ps": [
"162.105.146.119:34575"
]
}

+ 49
- 0
examples/cnn/tf_launch_server.py View File

@@ -0,0 +1,49 @@
import os
import tensorflow as tf
import multiprocessing
import signal
import json
import argparse


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)
os.environ['CUDA_VISIBLE_DEVICES'] = ''


pop_env()


def start_server(cluster, task_id):
server = tf.train.Server(cluster, job_name='ps', task_index=task_id)
server.join()


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path")
parser.add_argument("--id", type=int, required=True)
args = parser.parse_args()
raw_config = args.config
config = json.load(open(raw_config))
cluster = tf.train.ClusterSpec(config)
global proc
proc = multiprocessing.Process(
target=start_server, args=[cluster, args.id, ])
proc.start()
signal.signal(signal.SIGINT, signal_handler)
proc.join()


def signal_handler(signal, frame):
print("SIGINT signal caught, stop Training")
global proc
proc.kill()
exit(0)


if __name__ == '__main__':
main()

+ 234
- 0
examples/cnn/tf_launch_worker.py View File

@@ -0,0 +1,234 @@
import tensorflow as tf
import tf_models
import hetu as ht

import numpy as np
import argparse
import json
from time import time
import os
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
if task_id % 8 == 0:
logger.info(msg)


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / nesterov / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument("--rank", type=int, required=True,
help="rank of process")
parser.add_argument(
"--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path")

args = parser.parse_args()
global task_id

task_id = int(args.rank)
print_rank0("task id %d" % (task_id))
raw_config = args.config

if args.gpu == -1:
device = '/job:worker/task:%d/cpu:0' % (task_id)
print_rank0('Use CPU.')
else:
device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu)
print_rank0('Use GPU %d.' % args.gpu)

config = json.load(open(raw_config))
cluster = tf.train.ClusterSpec(config)

assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

with tf.device(
tf.compat.v1.train.replica_device_setter(
worker_device=device,
cluster=cluster)):
# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)

# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

if dataset == 'MNIST':
x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)
train_op = opt.minimize(loss)

server = tf.train.Server(
cluster, job_name="worker", task_index=task_id)

init = tf.compat.v1.global_variables_initializer()
sv = tf.train.Supervisor(
is_chief=(task_id == 0),
init_op=init,
recovery_wait_secs=1)
sess_config = tf.compat.v1.ConfigProto(
allow_soft_placement=True,
log_device_placement=False,
device_filters=["/job:ps",
"/job:worker/task:%d" % task_id])
sess = sv.prepare_or_wait_for_session(
server.target, config=sess_config)

sess.run(init)
# training
print_rank0("Start training loop...")
running_time = 0
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" %
(end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 194
- 0
examples/cnn/tf_main.py View File

@@ -0,0 +1,194 @@
import tensorflow as tf
import tf_models
import hetu as ht
import numpy as np
import argparse
from time import time
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
logger.info(msg)


if __name__ == "__main__":
# argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
args = parser.parse_args()

if args.gpu == -1:
device = '/cpu:0'
print_rank0('Use CPU.')
else:
device = '/gpu:%d' % args.gpu
print_rank0('Use GPU %d.' % args.gpu)

print_rank0("Training {} on TensorFlow".format(args.model))
assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \
'Model not supported now.'
model = eval('tf_models.' + args.model)

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = tf.train.GradientDescentOptimizer(
learning_rate=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = tf.train.MomentumOptimizer(
learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

# model definition
print_rank0('Building model...')
with tf.device(device):
if dataset == 'MNIST':
x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_)
elif dataset == 'CIFAR10':
if args.model == "tf_mlp":
x = tf.placeholder(
dtype=tf.float32, shape=(None, 3072), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
else:
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(
dtype=tf.float32, shape=(None, 10), name='y_')
loss, y = model(x, y_, 10)
elif dataset == 'CIFAR100':
x = tf.placeholder(dtype=tf.float32, shape=(
None, 32, 32, 3), name='x')
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_')
loss, y = model(x, y_, 100)

train_op = opt.minimize(loss)

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 784), train_set_y: (50000,)
# valid_set_x: (10000, 784), valid_set_y: (10000,)
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=10)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
if args.model == "tf_mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar(
num_class=100)
n_train_batches = train_set_x.shape[0] // args.batch_size
n_valid_batches = valid_set_x.shape[0] // args.batch_size
# train_set_x: (50000, 32, 32, 3), train_set_y: (50000,)
# valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,)
else:
raise NotImplementedError

# training
print_rank0("Start training loop...")
running_time = 0
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(args.num_epochs + 1):
print_rank0("Epoch %d" % i)
loss_all = 0
batch_num = 0
if args.timing:
start = time()
correct_predictions = []
for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
x_val = train_set_x[minibatch_start:minibatch_end]
y_val = train_set_y[minibatch_start:minibatch_end]
loss_val, predict_y, _ = sess.run([loss, y, train_op],
feed_dict={x: x_val, y_: y_val})
correct_prediction = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
batch_num += 1
loss_all += loss_val
loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Train loss = %f" % loss_all)
print_rank0("Train accuracy = %f" % accuracy)

if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" %
(end - start))
if i != 0:
running_time += (end - start)

if args.validate:
val_loss_all = 0
batch_num = 0
correct_predictions = []
for minibatch_index in range(n_valid_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
valid_x_val = valid_set_x[minibatch_start:minibatch_end]
valid_y_val = valid_set_y[minibatch_start:minibatch_end]
loss_val, valid_y_predicted = sess.run([loss, y],
feed_dict={x: valid_x_val, y_: valid_y_val})
correct_prediction = np.equal(
np.argmax(valid_y_val, 1),
np.argmax(valid_y_predicted, 1)).astype(np.float32)
correct_predictions.extend(correct_prediction)
val_loss_all += loss_val
batch_num += 1
val_loss_all /= batch_num
accuracy = np.mean(correct_predictions)
print_rank0("Validation loss = %f" % val_loss_all)
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 8
- 0
examples/cnn/tf_models/__init__.py View File

@@ -0,0 +1,8 @@
from .tf_LogReg import tf_logreg
from .tf_CNN import tf_cnn_3_layers
from .tf_LeNet import tf_lenet
from .tf_MLP import tf_mlp
from .tf_RNN import tf_rnn
from .tf_LSTM import tf_lstm
from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34
from .tf_VGG import tf_vgg16, tf_vgg19

+ 45
- 0
examples/cnn/tf_models/tf_CNN.py View File

@@ -0,0 +1,45 @@
import numpy as np
import tensorflow as tf


def tf_conv_relu_avg(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
x = tf.nn.relu(x)
x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1],
padding='VALID', strides=[1, 2, 2, 1])
return x


def tf_fc(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.reshape(x, (-1, shape[0]))
y = tf.matmul(x, weight) + bias
return y


def tf_cnn_3_layers(x, y_):
'''
3-layer-CNN model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print('Building 3-layer-CNN model in tensorflow...')
x = tf.reshape(x, [-1, 28, 28, 1])
x = tf_conv_relu_avg(x, (32, 1, 5, 5))
x = tf_conv_relu_avg(x, (64, 32, 5, 5))
x = tf.transpose(x, [0, 3, 1, 2])
y = tf_fc(x, (7 * 7 * 64, 10))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 81
- 0
examples/cnn/tf_models/tf_LSTM.py View File

@@ -0,0 +1,81 @@
import numpy as np
import tensorflow as tf


def tf_lstm(x, y_):
'''
LSTM model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building LSTM model in tensorflow...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

forget_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
forget_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
forget_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
input_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
input_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
input_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
output_gate_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
output_gate_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
output_gate_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
tanh_w = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
tanh_u = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32))
tanh_b = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden,)).astype(np.float32))
out_weights = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
out_bias = tf.Variable(np.random.normal(
scale=0.1, size=(dimoutput,)).astype(np.float32))
initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32)

last_c_state = initial_state
last_h_state = initial_state

for i in range(nsteps):
cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
# forget gate
cur_forget = tf.matmul(last_h_state, forget_gate_u) + \
tf.matmul(cur_x, forget_gate_w) + forget_gate_b
cur_forget = tf.sigmoid(cur_forget)
# input gate
cur_input = tf.matmul(last_h_state, input_gate_u) + \
tf.matmul(cur_x, input_gate_w) + input_gate_b
cur_input = tf.sigmoid(cur_input)
# output gate
cur_output = tf.matmul(last_h_state, output_gate_u) + \
tf.matmul(cur_x, output_gate_w) + output_gate_b
cur_output = tf.sigmoid(cur_output)
# tanh
cur_tanh = tf.matmul(last_h_state, tanh_u) + \
tf.matmul(cur_x, tanh_w) + tanh_b
cur_tanh = tf.tanh(cur_tanh)

last_c_state = last_c_state * cur_forget + cur_input * cur_tanh
last_h_state = tf.tanh(last_c_state) * cur_output

y = tf.matmul(last_h_state, out_weights) + out_bias
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 49
- 0
examples/cnn/tf_models/tf_LeNet.py View File

@@ -0,0 +1,49 @@
import numpy as np
import tensorflow as tf


def tf_conv_pool(x, in_channel, out_channel):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1])
x = tf.nn.relu(x)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
padding='VALID', strides=[1, 2, 2, 1])
return x


def tf_fc(x, shape, with_relu=True):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
if with_relu:
x = tf.nn.relu(x)
return x


def tf_lenet(x, y_):
'''
LeNet model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print('Building LeNet model in tensorflow...')
x = tf.reshape(x, [-1, 28, 28, 1])
x = tf_conv_pool(x, 1, 6)
x = tf_conv_pool(x, 6, 16)
x = tf.transpose(x, [0, 3, 1, 2])
x = tf.reshape(x, (-1, 7*7*16))
x = tf_fc(x, (7*7*16, 120), with_relu=True)
x = tf_fc(x, (120, 84), with_relu=True)
y = tf_fc(x, (84, 10), with_relu=False)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 23
- 0
examples/cnn/tf_models/tf_LogReg.py View File

@@ -0,0 +1,23 @@
import numpy as np
import tensorflow as tf


def tf_logreg(x, y_):
'''
Logistic Regression model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Build logistic regression model in tensorflow...")
weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32))
bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32))
y = tf.matmul(x, weight) + bias
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 34
- 0
examples/cnn/tf_models/tf_MLP.py View File

@@ -0,0 +1,34 @@
import numpy as np
import tensorflow as tf


def tf_fc(x, shape, with_relu=True):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
if with_relu:
x = tf.nn.relu(x)
return x


def tf_mlp(x, y_, num_class=10):
'''
MLP model in TensorFlow, for CIFAR dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building MLP model in tensorflow...")
x = tf_fc(x, (3072, 256), with_relu=True)
x = tf_fc(x, (256, 256), with_relu=True)
y = tf_fc(x, (256, num_class), with_relu=False)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 49
- 0
examples/cnn/tf_models/tf_RNN.py View File

@@ -0,0 +1,49 @@
import numpy as np
import tensorflow as tf


def tf_rnn(x, y_):
'''
RNN model in TensorFlow, for MNIST dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''

print("Building RNN model in tensorflow...")
diminput = 28
dimhidden = 128
dimoutput = 10
nsteps = 28

weight1 = tf.Variable(np.random.normal(
scale=0.1, size=(diminput, dimhidden)).astype(np.float32))
bias1 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, )).astype(np.float32))
weight2 = tf.Variable(np.random.normal(scale=0.1, size=(
dimhidden + dimhidden, dimhidden)).astype(np.float32))
bias2 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, )).astype(np.float32))
weight3 = tf.Variable(np.random.normal(
scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32))
bias3 = tf.Variable(np.random.normal(
scale=0.1, size=(dimoutput, )).astype(np.float32))
last_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32)
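# simple unrolled RNN cell: h_t = relu([x_t W1 + b1 ; h_{t-1}] W2 + b2)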

for i in range(nsteps):
cur_x = tf.slice(x, (0, i * diminput), (-1, diminput))
h = tf.matmul(cur_x, weight1) + bias1

s = tf.concat([h, last_state], axis=1)
s = tf.matmul(s, weight2) + bias2
last_state = tf.nn.relu(s)

final_state = last_state
y = tf.matmul(final_state, weight3) + bias3
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y

+ 113
- 0
examples/cnn/tf_models/tf_ResNet.py View File

@@ -0,0 +1,113 @@
import numpy as np
import tensorflow as tf


def tf_conv2d(x, in_channel, out_channel, stride=1):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME')
return x


def tf_batch_norm_with_relu(x, hidden):
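# note: normalization uses the moments of the current batch; no running statistics are kept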
scale = tf.Variable(np.random.normal(
scale=0.1, size=(hidden,)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(hidden,)).astype(np.float32))
axis = list(range(len(x.shape) - 1))
a_mean, a_var = tf.nn.moments(x, axis)
x = tf.nn.batch_normalization(
x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
x = tf.nn.relu(x)
return x


def tf_resnet_block(x, in_channel, num_blocks, is_first=False):
if is_first:
out_channel = in_channel
identity = x
x = tf_conv2d(x, in_channel, out_channel, stride=1)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = x + identity
else:
out_channel = 2 * in_channel
identity = x
x = tf_batch_norm_with_relu(x, in_channel)
x = tf_conv2d(x, in_channel, out_channel, stride=2)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
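# parameter-free shortcut: spatially downsample the identity and zero-pad its channels
# from in_channel to 2*in_channel so it matches the strided main branch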
identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [
in_channel // 2, in_channel // 2]])
x = x + identity

for i in range(1, num_blocks):
identity = x
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = tf_batch_norm_with_relu(x, out_channel)
x = tf_conv2d(x, out_channel, out_channel, stride=1)
x = x + identity

return x


def tf_fc(x, shape):
weight = tf.Variable(np.random.normal(
scale=0.1, size=shape).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=shape[-1:]).astype(np.float32))
x = tf.matmul(x, weight) + bias
return x


def tf_resnet(x, y_, num_layers, num_class=10):
'''
ResNet model in TensorFlow, for CIFAR10 dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
num_layers: 18 or 34
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''
print("Number of Class: {}".format(num_class))
base_size = 16

x = tf_conv2d(x, 3, base_size, stride=1)
x = tf_batch_norm_with_relu(x, base_size)

if num_layers == 18:
print("Building ResNet-18 model in tensorflow...")
x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True)
x = tf_resnet_block(x, base_size, num_blocks=2)
x = tf_resnet_block(x, 2 * base_size, num_blocks=2)
x = tf_resnet_block(x, 4 * base_size, num_blocks=2)
elif num_layers == 34:
print("Building ResNet-34 model in tensorflow...")
x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True)
x = tf_resnet_block(x, base_size, num_blocks=4)
x = tf_resnet_block(x, 2 * base_size, num_blocks=6)
x = tf_resnet_block(x, 4 * base_size, num_blocks=3)
else:
assert False, "Number of layers should be 18 or 34 !"

x = tf_batch_norm_with_relu(x, 8 * base_size)
x = tf.transpose(x, [0, 3, 1, 2])
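# for 32x32 inputs the feature map is now 4x4 with 8*base_size channels, i.e. 128*base_size values per sample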
x = tf.reshape(x, [-1, 128 * base_size])
y = tf_fc(x, (128 * base_size, num_class))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y


def tf_resnet18(x, y_, num_class=10):
return tf_resnet(x, y_, 18, num_class)


def tf_resnet34(x, y_, num_class=10):
return tf_resnet(x, y_, 34, num_class)

+ 103
- 0
examples/cnn/tf_models/tf_VGG.py View File

@@ -0,0 +1,103 @@
import numpy as np
import tensorflow as tf


def conv_bn_relu(x, in_channel, out_channel):
weight = tf.Variable(np.random.normal(scale=0.1, size=(
out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32))
scale = tf.Variable(np.random.normal(
scale=0.1, size=(out_channel,)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(out_channel,)).astype(np.float32))
x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME')
axis = list(range(len(x.shape) - 1))
a_mean, a_var = tf.nn.moments(x, axis)
x = tf.nn.batch_normalization(
x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2)
x = tf.nn.relu(x)
return x


def vgg_2block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def vgg_3block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def vgg_4block(x, in_channel, out_channel):
x = conv_bn_relu(x, in_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = conv_bn_relu(x, out_channel, out_channel)
x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[
1, 2, 2, 1], padding='VALID')
return x


def tf_fc(x, in_feat, out_feat):
weight = tf.Variable(np.random.normal(
scale=0.1, size=(in_feat, out_feat)).astype(np.float32))
bias = tf.Variable(np.random.normal(
scale=0.1, size=(out_feat,)).astype(np.float32))
x = tf.matmul(x, weight) + bias
return x


def tf_vgg(x, y_, num_layers, num_class=10):
'''
VGG model in TensorFlow, for CIFAR10 dataset.

Parameters:
x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C)
y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
num_layers: 16 or 19
Return:
loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,)
y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes)
'''
if num_layers == 16:
print('Building VGG-16 model in tensorflow')
x = vgg_2block(x, 3, 64)
x = vgg_2block(x, 64, 128)
x = vgg_3block(x, 128, 256)
x = vgg_3block(x, 256, 512)
x = vgg_3block(x, 512, 512)

elif num_layers == 19:
print('Building VGG-19 model in tensorflow')
x = vgg_2block(x, 3, 64)
x = vgg_2block(x, 64, 128)
x = vgg_4block(x, 128, 256)
x = vgg_4block(x, 256, 512)
x = vgg_4block(x, 512, 512)
else:
assert False, "Number of layers should be 18 or 34 !"

x = tf.reshape(x, [-1, 512])
x = tf_fc(x, 512, 4096)
x = tf_fc(x, 4096, 4096)
y = tf_fc(x, 4096, num_class)
print("Number of Class: {}".format(num_class))

loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
loss = tf.reduce_mean(loss)
return loss, y


def tf_vgg16(x, y_, num_class=10):
return tf_vgg(x, y_, 16, num_class)


def tf_vgg19(x, y_, num_class=10):
return tf_vgg(x, y_, 19, num_class)

+ 213
- 0
examples/cnn/torch_main.py View File

@@ -0,0 +1,213 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from pytorch_models import *
import hetu as ht
import numpy as np
import argparse
from time import time
import os
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def print_rank0(msg):
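# only the first rank on each 8-GPU node logs, to avoid duplicated output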
if local_rank % 8 == 0:
logger.info(msg)


def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None):
print_rank0('Epoch: %d' % epoch)
n_train_batches = data.shape[0] // batch_size

net.train()

train_loss = 0
correct = 0
total = 0

for minibatch_index in range(n_train_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
inputs = torch.Tensor(data[minibatch_start:minibatch_end])
targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()

inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad()
outputs = net(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()

train_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()

print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1)))
print_rank0("Train accuracy = %f" % (100.*correct/total))


def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None):
net.eval()
n_test_batches = data.shape[0] // batch_size
test_loss = 0
correct = 0
total = 0

with torch.no_grad():
for minibatch_index in range(n_test_batches):
minibatch_start = minibatch_index * args.batch_size
minibatch_end = (minibatch_index + 1) * args.batch_size
inputs = torch.Tensor(data[minibatch_start:minibatch_end])
targets = torch.Tensor(label[minibatch_start:minibatch_end]).long()

inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
test_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()

print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1)))
print_rank0("Validation accuracy = %f" % (100.*correct/total))


if __name__ == "__main__":
# argument parser
global local_rank
local_rank = 0
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True,
help='model to be tested')
parser.add_argument('--dataset', type=str, required=True,
help='dataset to be trained on')
parser.add_argument('--batch-size', type=int,
default=128, help='batch size')
parser.add_argument('--learning-rate', type=float,
default=0.1, help='learning rate')
parser.add_argument('--opt', type=str, default='sgd',
help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam')
parser.add_argument('--num-epochs', type=int,
default=20, help='epoch number')
parser.add_argument('--gpu', type=int, default=0,
help='gpu to be used, -1 means cpu')
parser.add_argument('--validate', action='store_true',
help='whether to use validation')
parser.add_argument('--timing', action='store_true',
help='whether to time the training phase')
parser.add_argument('--distributed', action='store_true',
help='whether to use distributed training')
parser.add_argument('--local_rank', type=int, default=-1)
args = parser.parse_args()

if args.distributed == True:
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
rank = int(os.getenv('RANK', '0'))
world_size = int(os.getenv("WORLD_SIZE", '1'))
print("***"*50)
print(init_method)
torch.distributed.init_process_group(backend="nccl",
world_size=world_size,
rank=rank,
init_method=init_method)

if args.gpu == -1:
device = 'cpu'
else:
if args.distributed == True:
local_rank = rank % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
device = torch.device('cuda:%d' % local_rank)
logger.info('Use GPU %d.' % local_rank)
else:
device = torch.device('cuda:%d' % args.gpu)
torch.cuda.set_device(args.gpu)
print_rank0('Use GPU %d.' % args.gpu)

assert args.model in ['mlp', 'resnet18', 'resnet34',
'vgg16', 'vgg19', 'rnn'], 'Model not supported now.'

assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet']
dataset = args.dataset

if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100':
net = eval(args.model)(100)
elif args.model == 'rnn':
net = eval(args.model)(28, 10, 128, 28)
else:
net = eval(args.model)()

net.to(device)
if args.distributed:
net = torch.nn.parallel.DistributedDataParallel(
net, device_ids=[local_rank])

assert args.opt in ['sgd', 'momentum', 'nesterov',
'adagrad', 'adam'], 'Optimizer not supported!'
if args.opt == 'sgd':
print_rank0('Use SGD Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate)
elif args.opt == 'momentum':
print_rank0('Use Momentum Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9)
elif args.opt == 'nesterov':
print_rank0('Use Nesterov Momentum Optimizer.')
opt = optim.SGD(net.parameters(), lr=args.learning_rate,
momentum=0.9, nesterov=True)
elif args.opt == 'adagrad':
print_rank0('Use AdaGrad Optimizer.')
opt = optim.Adagrad(net.parameters(), lr=args.learning_rate)
else:
print_rank0('Use Adam Optimizer.')
opt = optim.Adam(net.parameters(), lr=args.learning_rate)

criterion = nn.CrossEntropyLoss()

# data loading
print_rank0('Loading %s data...' % dataset)
if dataset == 'MNIST':
datasets = ht.data.mnist(onehot=False)
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
elif dataset == 'CIFAR10':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=10, onehot=False)
if args.model == "mlp":
train_set_x = train_set_x.reshape(train_set_x.shape[0], -1)
valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1)
elif dataset == 'CIFAR100':
train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar(
num_class=100, onehot=False)

running_time = 0
# training
print_rank0("Start training loop...")
for i in range(args.num_epochs + 1):
if args.timing:
start = time()
train(epoch=i, net=net, data=train_set_x, label=train_set_y,
batch_size=args.batch_size, criterion=criterion, optimizer=opt)
if args.timing:
end = time()
print_rank0("Running time of current epoch = %fs" % (end - start))
if i != 0:
running_time += (end - start)
test(epoch=i, net=net, data=valid_set_x, label=valid_set_y,
batch_size=args.batch_size, criterion=criterion)

print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
(args.num_epochs, running_time))

+ 9
- 0
examples/cnn/worker_conf0.json View File

@@ -0,0 +1,9 @@
{
"DMLC_ROLE":"worker",
"WORKER_ID":"0",
"DMLC_PS_ROOT_URI":"127.0.0.1",
"DMLC_PS_ROOT_PORT":"13030",
"DMLC_NUM_WORKER":"2",
"DMLC_NUM_SERVER":"1",
"DMLC_PS_VAN_TYPE":"p3"
}

+ 9
- 0
examples/cnn/worker_conf1.json View File

@@ -0,0 +1,9 @@
{
"DMLC_ROLE":"worker",
"WORKER_ID":"1",
"DMLC_PS_ROOT_URI":"127.0.0.1",
"DMLC_PS_ROOT_PORT":"13030",
"DMLC_NUM_WORKER":"2",
"DMLC_NUM_SERVER":"1",
"DMLC_PS_VAN_TYPE":"p3"
}

+ 2
- 0
examples/ctr/.gitignore View File

@@ -0,0 +1,2 @@
datasets/
logs/

+ 109
- 0
examples/ctr/README.md View File

@@ -0,0 +1,109 @@
# CTR Examples (with Distributed Settings)
In this directory we provide several models for CTR tasks. We train the Wide & Deep model on the Adult and Criteo datasets, and the DeepFM, DCN, and DC models on the Criteo dataset.

## Structure
```
- ctr
- datasets/ contains sampled criteo data
- models/ ctr models in hetu
- tf_models/ ctr models in tensorflow
- settings/ configurations for distributed training
- tests/ test scripts
- kill.sh script to kill all python processes
- run_hetu.py basic trainer for hetu
- run_tf_local.py local trainer for tensorflow
- run_tf_horovod.py trainer for tensorflow in horovod setting
- run_tf_parallax.py trainer for tensorflow in parallax setting
- tf_launch_server.py launcher for server in tensorflow
- tf_launch_worker.py launcher for worker in tensorflow
```

## Prepare criteo data
* We provide a sampled version of the kaggle-criteo dataset, located in ./datasets/criteo/ . To use the given data, please do not specify the 'all' or 'val' flags when running the test files.
* To download the original kaggle-criteo dataset, please specify a source in models/load_data.py and run ```python models/load_data.py``` to download the whole dataset.


## Flags for test files
Here we explain some of the flags you may use in test files:
* model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult')
* config: to specify the configuration file in settings.
* val: whether to run validation.
* cache: the embedding cache policy to use in PS/Hybrid mode (e.g. lfuopt).
* bsp: whether to use bsp (default asp) in PS/Hybrid mode. (In Hybrid mode, AllReduce can enforce bsp for the dense parameters, so there will be no stragglers.)
* all: whether to use all criteo data.
* bound: per-embedding-entry staleness bound in the cache setting; defaults to 100.


## Usage
If memory available, you can try to run the model locally, by running
```bash
# run locally
bash tests/local_{model}_{dataset}.sh
# run in ps setting (locally)
bash tests/ps_{model}_{dataset}.sh
# run in hybrid setting (locally)
bash tests/hybrid_{model}_{dataset}.sh

# run tensorflow locally
python run_tf_local.py --model {model}_{dataset}
# run tensorflow in horovod
horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset}
# run tensorflow in parallax
python {absolute_path_to}/run_tf_parallax.py
# run tensorflow in ps setting
python tf_launch_server.py --config {config} --id {rank}
python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config}
```


## Configuration
We use a simple yaml file to specify the run configuration.

```yaml
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
launch :
worker : 4
server : 1
scheduler : true
```

The 4 k-v pair in "shared" are used for PS-lite parameter server and will be added into environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" into an available IP address in the cluster.

The following "launch" is only used in PS-mode (ommitted in hybrid mode). This means that the number of worker, server and scheduler launched locally on this machine. In hybrid mode, workers are launched by mpirun. Servers and schedulers will be launched by


## Examples
### Local execution
Run wdl with criteo locally (if the whole dataset has been downloaded, you can use all the data or the validation data):
```bash
python run_hetu.py --model wdl_criteo (--all) (--val)
```

### PS mode execution
Run in PS mode locally; the same setup can also be run on multiple nodes.
```bash
# launch scheduler and server, -n means number of servers, --sched means using scheduler
python -m hetu.launcher {config} -n 1 --sched
# launch workers (or run scheduler and server together if configured in config file)
python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10)
```
You can also specify the cache to be used, as well as the cache bound.


### Hybrid mode execution
You must launch a scheduler and server in one terminal:
```bash
python -m hetu.launcher {config} -n 1 --sched
```
Then launch the workers simultaneously using the mpirun command:
```bash
mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ...
```
Or, in a multi-node setting:
```bash
mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ...
```

+ 3
- 0
examples/ctr/kill.sh View File

@@ -0,0 +1,3 @@
#!/bin/bash
#pkill -f mnist_mlp_ps.py
kill -9 $(pidof python)

+ 5
- 0
examples/ctr/models/__init__.py View File

@@ -0,0 +1,5 @@
from .wdl_adult import wdl_adult
from .dcn_criteo import dcn_criteo
from .dc_criteo import dc_criteo
from .wdl_criteo import wdl_criteo
from .deepfm_criteo import dfm_criteo

+ 63
- 0
examples/ctr/models/dc_criteo.py View File

@@ -0,0 +1,63 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def residual_layer(x0, input_dim, hidden_dim):
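# residual MLP block: y = relu(x0 + relu(x0 W1 + b1) W2 + b2)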

embedding_len = input_dim
weight_1 = init.random_normal(
shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1')
bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1')
weight_2 = init.random_normal(
shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2')
bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2')

x0w = ht.matmul_op(x0, weight_1) # (batch, hidden_dim)
x0w_b = x0w + ht.broadcastto_op(bias_1, x0w)

relu1 = ht.relu_op(x0w_b)
x1w = ht.matmul_op(relu1, weight_2) # (batch, input_dim)
x1w_b = x1w + ht.broadcastto_op(bias_2, x1w)
residual = x1w_b + x0
y = ht.relu_op(residual)
return y


def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3):
for i in range(num_layers):
x0 = residual_layer(x0, input_dim, hidden_dim)
return x0


def dc_criteo(dense_input, sparse_input, y_):

feature_dimension = 33762577
embedding_size = 8
learning_rate = 0.001

Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding")
sparse_input = ht.embedding_lookup_op(Embedding, sparse_input)
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))

# dc_model
x = ht.concat_op(sparse_input, dense_input, axis=1)

input_dim = 26 * 8 + 13
hidden_dim = input_dim
residual_out = build_residual_layers(
x, input_dim, hidden_dim, num_layers=5)

W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4")
y = ht.matmul_op(residual_out, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 68
- 0
examples/ctr/models/dcn_criteo.py View File

@@ -0,0 +1,68 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def cross_layer(x0, x1):
# x0: input embedding feature (batch_size, 26 * embedding_size + 13)
# x1: the output of last layer (batch_size, 26 * embedding_size + 13)

embedding_len = 26 * 128 + 13
weight = init.random_normal(
shape=(embedding_len, 1), stddev=0.01, name='weight')
bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias')
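# cross layer: y = x0 * (x1 w) + b + x1, an explicit feature cross with a residual connection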
x1w = ht.matmul_op(x1, weight) # (batch_size, 1)
y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0))
y = y + x1 + ht.broadcastto_op(bias, y)
return y


def build_cross_layer(x0, num_layers=3):
x1 = x0
for i in range(num_layers):
x1 = cross_layer(x0, x1)
return x1


def dcn_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.003

Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_input = ht.embedding_lookup_op(
Embedding, sparse_input, ctx=ht.cpu(0))
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))
x = ht.concat_op(sparse_input, dense_input, axis=1)
# Cross Network
cross_output = build_cross_layer(x, num_layers=3)

# DNN
flatten = x
W1 = init.random_normal(
[26*embedding_size + 13, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

W4 = init.random_normal(
[256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = ht.concat_op(cross_output, y3, axis=1)
y = ht.matmul_op(y4, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 59
- 0
examples/ctr/models/deepfm_criteo.py View File

@@ -0,0 +1,59 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def dfm_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.01

# FM
Embedding1 = init.random_normal(
[feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0))
FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
sparse_1dim_input = ht.embedding_lookup_op(
Embedding1, sparse_input, ctx=ht.cpu(0))
fm_dense_part = ht.matmul_op(dense_input, FM_W)
fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1)
# fst order output
y1 = fm_dense_part + fm_sparse_part

Embedding2 = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_2dim_input = ht.embedding_lookup_op(
Embedding2, sparse_input, ctx=ht.cpu(0))
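# the second-order FM term below uses the identity 0.5 * ((sum_i e_i)^2 - sum_i e_i^2),
# which gives the sum of all pairwise embedding interactions without an explicit double loop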
sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1)
sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum)

sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input)
sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1)
sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
sparse_2dim_half = sparse_2dim * 0.5
# snd order output
y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

# DNN
flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size))
W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = y1 + y2
y = y4 + y3
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 320
- 0
examples/ctr/models/load_data.py View File

@@ -0,0 +1,320 @@
import os
import numpy as np


###########################################################################
# criteo
###########################################################################

def download_criteo(path):
import tarfile
import pandas as pd
from six.moves import urllib
if not os.path.exists(path):
os.makedirs(path)
assert os.path.isdir(path), 'Please provide a directory path.'
# this source may be invalid, please use other valid sources.
origin = (
'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'
)
print('Downloading data from %s' % origin)
dataset = os.path.join(path, 'criteo.tar.gz')
urllib.request.urlretrieve(origin, dataset)
print("Extracting criteo zip...")
with tarfile.open(dataset) as f:
f.extractall(path=path)
print("Create local files...")

# save csv filed
df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None)
df.columns = ['label'] + ["I" +
str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)]
df.to_csv(os.path.join(path, "train.csv"), index=0)
print('Csv file saved.')

# save numpy arrays
target_path = [os.path.join(path, filename) for filename in [
'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy',
'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']]
dense_feats = [col for col in df.columns if col.startswith('I')]
sparse_feats = [col for col in df.columns if col.startswith('C')]
labels = df['label']
dense_feats = process_dense_feats(df, dense_feats)
sparse_feats = process_sparse_feats(df, sparse_feats)
num_data = dense_feats.shape[0]
perm = np.random.permutation(num_data)
# split data in 2 parts
test_num = num_data // 10
processed_data = [
dense_feats[perm[:-test_num]], # train dense
sparse_feats[perm[:-test_num]], # train sparse
labels[perm[:-test_num]], # train labels
dense_feats[perm[-test_num:]], # validate dense
sparse_feats[perm[-test_num:]], # validate sparse
labels[perm[-test_num:]], # validate labels
]
print('Array shapes:')
for i in range(len(processed_data)):
print(os.path.split(target_path[i])
[-1].split('.')[0], processed_data[i].shape)
np.save(target_path[i], processed_data[i])
print('Numpy arrays saved.')


def process_dense_feats(data, feats):
d = data.copy()
d = d[feats].fillna(0.0)
for f in feats:
d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
return d


def process_sparse_feats(data, feats):
from sklearn.preprocessing import LabelEncoder
# process to embeddings.
d = data.copy()
d = d[feats].fillna("-1")
for f in feats:
label_encoder = LabelEncoder()
d[f] = label_encoder.fit_transform(d[f])
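# offset each column's encoded ids so that all columns share one global embedding index space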
feature_cnt = 0
for f in feats:
d[f] += feature_cnt
feature_cnt += d[f].nunique()
return d


def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True):
import pandas as pd
csv_path = os.path.join(path, "train.csv")
if not os.path.exists(csv_path):
download_criteo(path)
df = pd.read_csv(csv_path, nrows=nrows, header=0)
dense_feats = [col for col in df.columns if col.startswith('I')]
sparse_feats = [col for col in df.columns if col.startswith('C')]
labels = np.array(df['label']).reshape(-1, 1)
dense_feats = np.array(process_dense_feats(df, dense_feats))
sparse_feats = np.array(process_sparse_feats(
df, sparse_feats)).astype(np.int32)
if return_val:
test_num = nrows // 10
train_dense = dense_feats[:-test_num]
train_sparse = sparse_feats[:-test_num]
train_label = labels[:-test_num]
validate_dense = dense_feats[-test_num:]
validate_sparse = sparse_feats[-test_num:]
validate_label = labels[-test_num:]
return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label)
else:
return dense_feats, sparse_feats, labels


def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')):
# all data should be available! no checking.
processed_data = [np.load(os.path.join(path, filename))
for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']]
return tuple(processed_data)


def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True):
file_paths = [os.path.join(path, filename) for filename in [
'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy',
'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']]
if not all([os.path.exists(p) for p in file_paths]):
download_criteo(path)
files = [np.load(filename) for filename in file_paths]
if return_val:
return (files[0], files[1]), (files[2], files[3]), (files[4], files[5])
else:
return files[0], files[2], files[4]


###########################################################################
# adult
###########################################################################

def maybe_download(train_data, test_data):
"""If adult data "train.csv" and "test.csv" are not in your directory,
download them.
"""
import pandas as pd

COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"]

if not os.path.exists(train_data):
print("downloading training data...")
df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
names=COLUMNS, skipinitialspace=True)
else:
df_train = pd.read_csv("train.csv")

if not os.path.exists(test_data):
print("downloading testing data...")
df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
names=COLUMNS, skipinitialspace=True, skiprows=1)
else:
df_test = pd.read_csv("test.csv")

return df_train, df_test


def cross_columns(x_cols):
"""simple helper to build the crossed columns in a pandas dataframe
"""
crossed_columns = dict()
colnames = ['_'.join(x_c) for x_c in x_cols]
for cname, x_c in zip(colnames, x_cols):
crossed_columns[cname] = x_c
return crossed_columns


def val2idx(df, cols):
"""helper to index categorical columns before embeddings.
"""
val_types = dict()
for c in cols:
val_types[c] = df[c].unique()

val_to_idx = dict()
for k, v in val_types.items():
val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

for k, v in val_to_idx.items():
df[k] = df[k].apply(lambda x: v[x])

unique_vals = dict()
for c in cols:
unique_vals[c] = df[c].nunique()

return df, unique_vals


def onehot(x):
from sklearn.preprocessing import OneHotEncoder
return np.array(OneHotEncoder().fit_transform(x).todense())


def wide(df_train, df_test, wide_cols, x_cols, target):
import pandas as pd
print('Processing wide data')
df_train['IS_TRAIN'] = 1
df_test['IS_TRAIN'] = 0
df_wide = pd.concat([df_train, df_test])

crossed_columns_d = cross_columns(x_cols)
categorical_columns = list(
df_wide.select_dtypes(include=['object']).columns)

wide_cols += list(crossed_columns_d.keys())

for k, v in crossed_columns_d.items():
df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)

df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]

dummy_cols = [
c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols])

train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
assert all(train.columns == test.columns)

cols = [c for c in train.columns if c != target]
X_train = train[cols].values
y_train = train[target].values.reshape(-1, 1)
X_test = test[cols].values
y_test = test[target].values.reshape(-1, 1)
return X_train, y_train, X_test, y_test


def load_adult_data(return_val=True):
import pandas as pd
df_train, df_test = maybe_download("train.csv", "test.csv")

df_train['income_label'] = (
df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
df_test['income_label'] = (
df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

age_groups = [0, 25, 65, 90]
age_labels = range(len(age_groups) - 1)
df_train['age_group'] = pd.cut(
df_train['age'], age_groups, labels=age_labels)
df_test['age_group'] = pd.cut(
df_test['age'], age_groups, labels=age_labels)

# columns for wide model
wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
'relationship', 'race', 'gender', 'native_country', 'age_group']
x_cols = (['education', 'occupation'], ['native_country', 'occupation'])

# columns for deep model
embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
'relationship', 'race', 'gender', 'native_country']
cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']

target = 'income_label'

x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
df_train, df_test, wide_cols, x_cols, target)
x_train_wide = np.array(x_train_wide).astype(np.float32)
x_test_wide = np.array(x_test_wide).astype(np.float32)

print('Processing deep data')
df_train['IS_TRAIN'] = 1
df_test['IS_TRAIN'] = 0
df_deep = pd.concat([df_train, df_test])

deep_cols = embedding_cols + cont_cols
df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit on the training rows only, then transform the whole concatenated frame
scaler.fit(df_train[cont_cols])
df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
df_deep, unique_vals = val2idx(df_deep, embedding_cols)

train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)

x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)

x_train_deep = np.transpose(x_train_deep)
x_test_deep = np.transpose(x_test_deep)
y_train = onehot(y_train)
y_test = onehot(y_test)

if return_val:
return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
else:
return x_train_deep, x_train_wide, y_train


###########################################################################
# avazu
###########################################################################

def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')):
import pandas as pd
# please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data
train_file = os.path.join(path, 'train.csv')
# test_file = os.path.join(path, 'test.csv') # useless, no labels

df_train = pd.read_csv(train_file)
sparse_feats = process_sparse_feats(df_train, df_train.columns[2:])
# the embedding num for each feature:
# [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60]
# sum: 9449445

np.save(os.path.join(path, 'sparse.npy'), sparse_feats)


if __name__ == '__main__':
download_criteo(os.path.join(os.path.split(
os.path.abspath(__file__))[0], '../datasets/criteo'))

+ 56
- 0
examples/ctr/models/wdl_adult.py View File

@@ -0,0 +1,56 @@
import hetu as ht
from hetu import init


def wdl_adult(X_deep, X_wide, y_):
lr = 5 / 128
dim_wide = 809
dim_deep = 68

W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
b1 = init.random_normal([50], stddev=0.1, name="b1")
W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
b2 = init.random_normal([20], stddev=0.1, name="b2")

# deep
Embedding = []
X_deep_input = None

for i in range(8):
Embedding_name = "Embedding_deep_" + str(i)
Embedding.append(init.random_normal(
[50, 8], stddev=0.1, name=Embedding_name))
now = ht.embedding_lookup_op(Embedding[i], X_deep[i])
now = ht.array_reshape_op(now, (-1, 8))
if X_deep_input is None:
X_deep_input = now
else:
X_deep_input = ht.concat_op(X_deep_input, now, 1)

for i in range(4):
now = ht.array_reshape_op(X_deep[i + 8], (-1, 1))
X_deep_input = ht.concat_op(X_deep_input, now, 1)

mat1 = ht.matmul_op(X_deep_input, W1)
add1 = mat1 + ht.broadcastto_op(b1, mat1)
relu1 = ht.relu_op(add1)
dropout1 = relu1
mat2 = ht.matmul_op(dropout1, W2)
add2 = mat2 + ht.broadcastto_op(b2, mat2)
relu2 = ht.relu_op(add2)
dropout2 = relu2
dmodel = dropout2

# wide
wmodel = ht.concat_op(X_wide, dmodel, 1)
wmodel = ht.matmul_op(wmodel, W)

prediction = wmodel
loss = ht.softmaxcrossentropy_op(prediction, y_)
loss = ht.reduce_mean_op(loss, [0])

opt = ht.optim.SGDOptimizer(learning_rate=lr)
train_op = opt.minimize(loss)

return loss, prediction, y_, train_op

+ 42
- 0
examples/ctr/models/wdl_criteo.py View File

@@ -0,0 +1,42 @@
import hetu as ht
from hetu import init

import numpy as np
import time


def wdl_criteo(dense_input, sparse_input, y_):
feature_dimension = 33762577
embedding_size = 128
learning_rate = 0.01
Embedding = init.random_normal(
[feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0))
sparse_input = ht.embedding_lookup_op(
Embedding, sparse_input, ctx=ht.cpu(0))
sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size))

# DNN
flatten = dense_input
W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

W4 = init.random_normal(
[256 + 26*embedding_size, 1], stddev=0.01, name="W4")

fc1 = ht.matmul_op(flatten, W1)
relu1 = ht.relu_op(fc1)
fc2 = ht.matmul_op(relu1, W2)
relu2 = ht.relu_op(fc2)
y3 = ht.matmul_op(relu2, W3)

y4 = ht.concat_op(sparse_input, y3, axis=1)
y = ht.matmul_op(y4, W4)
y = ht.sigmoid_op(y)

loss = ht.binarycrossentropy_op(y, y_)
loss = ht.reduce_mean_op(loss, [0])
opt = ht.optim.SGDOptimizer(learning_rate=learning_rate)
train_op = opt.minimize(loss)

return loss, y, y_, train_op

+ 230
- 0
examples/ctr/run_hetu.py View File

@@ -0,0 +1,230 @@
import hetu as ht
from hetu.launcher import launch

import os
import os.path as osp
import numpy as np
import yaml
import time
import argparse
from tqdm import tqdm
from sklearn import metrics


def worker(args):
def train(iterations, auc_enabled=True, tqdm_enabled=False):
localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
train_loss = []
train_acc = []
if auc_enabled:
train_auc = []
for it in localiter:
loss_val, predict_y, y_val, _ = executor.run(
'train', convert_to_numpy_ret_vals=True)
if y_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
y_val,
predict_y > 0.5).astype(np.float32)
else:
acc_val = np.equal(
np.argmax(y_val, 1),
np.argmax(predict_y, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
if auc_enabled:
train_auc.append(metrics.roc_auc_score(y_val, predict_y))
if auc_enabled:
return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
else:
return np.mean(train_loss), np.mean(train_acc)

def validate(iterations, tqdm_enabled=False):
localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
test_loss = []
test_acc = []
test_auc = []
for it in localiter:
loss_val, test_y_predicted, y_test_val = executor.run(
'validate', convert_to_numpy_ret_vals=True)
if y_test_val.shape[1] == 1: # for criteo case
correct_prediction = np.equal(
y_test_val,
test_y_predicted > 0.5).astype(np.float32)
else:
correct_prediction = np.equal(
np.argmax(y_test_val, 1),
np.argmax(test_y_predicted, 1)).astype(np.float32)
test_loss.append(loss_val[0])
test_acc.append(correct_prediction)
test_auc.append(metrics.roc_auc_score(
y_test_val, test_y_predicted))
return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

def get_current_shard(data):
if args.comm is not None:
part_size = data.shape[0] // nrank
start = part_size * rank
end = start + part_size if rank != nrank - 1 else data.shape[0]
return data[start:end]
else:
return data

batch_size = 128
dataset = args.dataset
model = args.model
device_id = 0

if args.comm == 'PS':
rank = ht.get_worker_communicate().rank()
nrank = int(os.environ['DMLC_NUM_WORKER'])
device_id = rank % 8
elif args.comm == 'Hybrid':
comm = ht.wrapped_mpi_nccl_init()
device_id = comm.dev_id
rank = comm.rank
nrank = int(os.environ['DMLC_NUM_WORKER'])

if dataset == 'criteo':
# define models for criteo
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, labels = process_all_criteo_data(
return_val=args.val)
elif args.val:
from models.load_data import process_head_criteo_data
dense, sparse, labels = process_head_criteo_data(return_val=True)
else:
from models.load_data import process_sampled_criteo_data
dense, sparse, labels = process_sampled_criteo_data()
if isinstance(dense, tuple):
dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [
get_current_shard(dense[1]), batch_size, 'validate']])
sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [
get_current_shard(sparse[1]), batch_size, 'validate']])
y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [
get_current_shard(labels[1]), batch_size, 'validate']])
else:
dense_input = ht.dataloader_op(
[[get_current_shard(dense), batch_size, 'train']])
sparse_input = ht.dataloader_op(
[[get_current_shard(sparse), batch_size, 'train']])
y_ = ht.dataloader_op(
[[get_current_shard(labels), batch_size, 'train']])
elif dataset == 'adult':
from models.load_data import load_adult_data
x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()
dense_input = [
ht.dataloader_op([
[get_current_shard(x_train_deep[:, i]), batch_size, 'train'],
[get_current_shard(x_test_deep[:, i]), batch_size, 'validate'],
]) for i in range(12)
]
sparse_input = ht.dataloader_op([
[get_current_shard(x_train_wide), batch_size, 'train'],
[get_current_shard(x_test_wide), batch_size, 'validate'],
])
y_ = ht.dataloader_op([
[get_current_shard(y_train), batch_size, 'train'],
[get_current_shard(y_test), batch_size, 'validate'],
])
else:
raise NotImplementedError
print("Data loaded.")

loss, prediction, y_, train_op = model(dense_input, sparse_input, y_)

eval_nodes = {'train': [loss, prediction, y_, train_op]}
if args.val:
print('Validation enabled...')
eval_nodes['validate'] = [loss, prediction, y_]
executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs')
executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id),
comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path)

if args.all and dataset == 'criteo':
print('Processing all data...')
file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[
args.comm], args.raw_model)
file_path += '%d.log' % rank if args.comm else '.log'
file_path = osp.join(osp.dirname(
osp.abspath(__file__)), 'logs', file_path)
log_file = open(file_path, 'w')
total_epoch = args.nepoch if args.nepoch > 0 else 11
for ep in range(total_epoch):
print("ep: %d" % ep)
ep_st = time.time()
train_loss, train_acc, train_auc = train(executor.get_batch_num(
'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True)
ep_en = time.time()
if args.val:
val_loss, val_acc, val_auc = validate(
executor.get_batch_num('validate'))
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
% (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st)
else:
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (train_loss, train_acc, train_auc, ep_en - ep_st)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()
else:
total_epoch = args.nepoch if args.nepoch > 0 else 50
for ep in range(total_epoch):
if ep == 5:
start = time.time()
print("epoch %d" % ep)
ep_st = time.time()
train_loss, train_acc = train(
executor.get_batch_num('train'), auc_enabled=False)
ep_en = time.time()
if args.val:
val_loss, val_acc, val_auc = validate(
executor.get_batch_num('validate'))
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f"
% (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc))
else:
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (train_loss, train_acc, ep_en - ep_st))
print('all time:', time.time() - start)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--val", action="store_true",
help="whether to use validation")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
parser.add_argument("--comm", default=None,
help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid")
parser.add_argument("--bsp", action="store_true",
help="whether to use bsp instead of asp")
parser.add_argument("--cache", default=None, help="cache policy")
parser.add_argument("--bound", default=100, help="cache bound")
parser.add_argument("--config", type=str, default=osp.join(osp.dirname(
osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps")
parser.add_argument("--nepoch", type=int, default=-1,
help="num of epochs, each train 1/10 data")
args = parser.parse_args()
import models
print('Model:', args.model)
model = eval('models.' + args.model)
args.dataset = args.model.split('_')[-1]
args.raw_model = args.model
args.model = model
if args.comm is None:
worker(args)
elif args.comm == 'Hybrid':
settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader)
value = settings['shared']
os.environ['DMLC_ROLE'] = 'worker'
for k, v in value.items():
os.environ[k] = str(v)
worker(args)
elif args.comm == 'PS':
launch(worker, args)
else:
raise NotImplementedError

+ 174
- 0
examples/ctr/run_tf_horovod.py View File

@@ -0,0 +1,174 @@
import os
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics
import horovod.tensorflow as hvd


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model
# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model
# if using a multi-node setting with conda, /etc/bash.bashrc needs to be modified
# we can also use mpirun (gloo by default):
# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\
# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model


def train_criteo(model, args):
hvd.init()

def get_current_shard(data):
part_size = data.shape[0] // hvd.size()
start = part_size * hvd.rank()
end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0]
return data[start:end]

if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature = get_current_shard(dense[0])
sparse_feature = get_current_shard(sparse[0])
labels = get_current_shard(all_labels[0])
val_dense = get_current_shard(dense[1])
val_sparse = get_current_shard(sparse[1])
val_labels = get_current_shard(all_labels[1])
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()
dense_feature = get_current_shard(dense_feature)
sparse_feature = get_current_shard(sparse_feature)
labels = get_current_shard(labels)

batch_size = 128
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])

loss, y, opt = model(dense_input, sparse_input, y_)
global_step = tf.compat.v1.train.get_or_create_global_step()
# in DistributedOptimizer all tensors are reduced on GPU by default;
# this can be changed with the device_sparse=... / device_dense=... arguments,
# but device_sparse='/cpu:0' degrades performance
train_op = hvd.DistributedOptimizer(
opt).minimize(loss, global_step=global_step)
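# a minimal sketch of overriding the reduction device, assuming the standard
# device_sparse keyword of hvd.DistributedOptimizer:
#   train_op = hvd.DistributedOptimizer(
#       opt, device_sparse='/cpu:0').minimize(loss, global_step=global_step)
# (as noted above, CPU reduction of sparse tensors trades speed for GPU memory)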

gpu_options = tf.compat.v1.GPUOptions(
allow_growth=True, visible_device_list=str(hvd.local_rank()))
# horovod broadcasts initial variables from GPU by default, which can cause OOM, so broadcast from CPU instead
hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')]
sess = tf.compat.v1.train.MonitoredTrainingSession(
hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = './logs/tf_hvd_%s_%d.log' % (
args.model, hvd.local_rank())
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 400
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iterations = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iterations):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)
train_criteo(model, args)


if __name__ == '__main__':
main()

+ 202
- 0
examples/ctr/run_tf_local.py View File

@@ -0,0 +1,202 @@
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics


def train_criteo(model, args):
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature, val_dense = dense
sparse_feature, val_sparse = sparse
labels, val_labels = all_labels
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()

batch_size = 128
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])

loss, y, opt = model(dense_input, sparse_input, y_)
train_op = opt.minimize(loss)

init = tf.compat.v1.global_variables_initializer()
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
sess = tf.compat.v1.Session(
config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
sess.run(init)

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = './logs/tf_local_%s.log' % (args.model)
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 11
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iteration = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iteration):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def train_adult(model):
batch_size = 128
total_epoch = 50
dim_wide = 809

X_deep = []
for i in range(8):
X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1]))
for i in range(4):
X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1]))
X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2])
loss, y, train_op = model(X_deep, X_wide, y_)

init = tf.compat.v1.global_variables_initializer()

gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

sess.run(init)

from models.load_data import load_adult_data
x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False)

iterations = x_train_deep.shape[0] // batch_size
for ep in range(total_epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
pre_index = 0

for it in range(iterations):
batch_x_deep = x_train_deep[pre_index:pre_index + batch_size]
batch_x_wide = x_train_wide[pre_index:pre_index + batch_size]
batch_y = y_train[pre_index:pre_index + batch_size]
pre_index += batch_size

my_feed_dict = dict()
for i in range(12):
my_feed_dict[X_deep[i]] = np.array(
batch_x_deep[:, i]).reshape(-1, 1)

my_feed_dict[X_wide] = np.array(batch_x_wide)
my_feed_dict[y_] = batch_y
loss_val = sess.run([loss, y, y_, train_op],
feed_dict=my_feed_dict)
acc_val = np.equal(
np.argmax(loss_val[1], 1),
np.argmax(loss_val[2], 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)

if dataset == 'criteo':
train_criteo(model, args)
elif dataset == 'adult':
train_adult(model)
else:
raise NotImplementedError


if __name__ == '__main__':
main()

+ 211
- 0
examples/ctr/run_tf_parallax.py View File

@@ -0,0 +1,211 @@
import os
import numpy as np
import tensorflow as tf
import time
import argparse
from tqdm import tqdm
from sklearn import metrics

from autodist import AutoDist
from autodist.resource_spec import ResourceSpec
from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax
from autodist.strategy.base import Strategy
from autodist.kernel.common.utils import get_op_name
from tensorflow.python.framework import ops


def pop_env():
for k in ['https_proxy', 'http_proxy']:
if k in os.environ:
os.environ.pop(k)


pop_env()

# Please DO NOT modify /etc/bash.bashrc to activate the conda environment.
# Use python_venv in the spec yml file instead.
# Use the absolute path of the python file.
# Here we use the tf native partitioner instead of autodist's PartitionedPS.
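# a minimal sketch of how the native partitioner is typically attached to an
# embedding variable (tf.compat.v1 API; num_ps_shards, num_features and embed_dim
# are illustrative names, not taken from this file):
#   partitioner = tf.compat.v1.fixed_size_partitioner(num_ps_shards)
#   with tf.compat.v1.variable_scope('embedding', partitioner=partitioner):
#       embed_table = tf.compat.v1.get_variable(
#           'table', shape=[num_features, embed_dim], dtype=tf.float32)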


class Parallaxx(PSLoadBalancing, AllReduce):
"""
Modified version of the original Parallax strategy that removes variable replicas on CPUs.
"""

def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0):
PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness)
AllReduce.__init__(self, chunk_size)

# pylint: disable=attribute-defined-outside-init
def build(self, graph_item, resource_spec):
"""Generate the strategy."""
expr = Strategy()

# For each variable, generate variable synchronizer config
expr.graph_config.replicas.extend(
[k for k, v in resource_spec.gpu_devices])
reduction_device_names = [k for k, _ in resource_spec.cpu_devices]
self.loads = {ps: 0.0 for ps in reduction_device_names}

# Generate node config
node_config = []
for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()):
var_op_name = get_op_name(var.name)
grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name]
if isinstance(grad, ops.Tensor): # this is a dense variable
group_id = idx // self.chunk_size
config = self._gen_all_reduce_node_config(
var.name, group=group_id)
else: # sparse updates
# For Parallax Strategy, all PS vars are sparse so we don't use a proxy.
# Sparse variables are likely larger, so keeping copies would be costlier,
# and usually each device only requires a small part of the overall variable.
config = self._gen_ps_node_config(
var,
# For the Parallax strategy, all PS vars are sparse and need no proxy.
False,
self._sync,
self._staleness
)
node_config.append(config)
expr.node_config.extend(node_config)

return expr


def train_criteo(model, args):
resource_spec_file = os.path.join(os.path.dirname(
__file__), 'settings', 'plx_local_spec.yml')
autodist = AutoDist(resource_spec_file, Parallaxx())
respec = ResourceSpec(resource_spec_file)
if args.all:
from models.load_data import process_all_criteo_data
dense, sparse, all_labels = process_all_criteo_data()
dense_feature, val_dense = dense
sparse_feature, val_sparse = sparse
labels, val_labels = all_labels
else:
from models.load_data import process_sampled_criteo_data
dense_feature, sparse_feature, labels = process_sampled_criteo_data()

# autodist will split the fed data across replicas
batch_size = 128
with tf.Graph().as_default() as g, autodist.scope():
dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
embed_partitioner = tf.compat.v1.fixed_size_partitioner(
len(respec.nodes), 0) if len(respec.nodes) > 1 else None
loss, y, opt = model(dense_input, sparse_input,
y_, embed_partitioner, False)
train_op = opt.minimize(loss)

sess = autodist.create_distributed_session()

my_feed_dict = {
dense_input: np.empty(shape=(batch_size, 13)),
sparse_input: np.empty(shape=(batch_size, 26)),
y_: np.empty(shape=(batch_size, 1)),
}

if args.all:
raw_log_file = os.path.join(os.path.split(os.path.abspath(__file__))[
0], 'logs', 'tf_plx_%s.log' % (args.model))
print('Processing all data, log to', raw_log_file)
log_file = open(raw_log_file, 'w')
iterations = dense_feature.shape[0] // batch_size
total_epoch = 11
start_index = 0
for ep in range(total_epoch):
print("epoch %d" % ep)
st_time = time.time()
train_loss, train_acc, train_auc = [], [], []
for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]
start_index += batch_size
if start_index + batch_size > dense_feature.shape[0]:
start_index = 0
loss_val = sess.run(
[loss, y, y_, train_op], feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
acc_val = np.equal(
true_val,
pred_val > 0.5)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
train_auc.append(metrics.roc_auc_score(true_val, pred_val))
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
tra_auc = np.mean(train_auc)
en_time = time.time()
train_time = en_time - st_time
printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
% (tra_loss, tra_accuracy, tra_auc, train_time)
print(printstr)
log_file.write(printstr + '\n')
log_file.flush()

else:
iteration = dense_feature.shape[0] // batch_size

epoch = 50
for ep in range(epoch):
print('epoch', ep)
if ep == 5:
start = time.time()
ep_st = time.time()
train_loss = []
train_acc = []
for idx in range(iteration):
start_index = idx * batch_size
my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size]
my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size]
my_feed_dict[y_][:] = labels[start_index: start_index+batch_size]

loss_val = sess.run(
[loss, y, y_, train_op], feed_dict=my_feed_dict)
pred_val = loss_val[1]
true_val = loss_val[2]
if pred_val.shape[1] == 1: # for criteo case
acc_val = np.equal(
true_val,
pred_val > 0.5)
else:
acc_val = np.equal(
np.argmax(pred_val, 1),
np.argmax(true_val, 1)).astype(np.float32)
train_loss.append(loss_val[0])
train_acc.append(acc_val)
tra_accuracy = np.mean(train_acc)
tra_loss = np.mean(train_loss)
ep_en = time.time()
print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f"
% (tra_loss, tra_accuracy, ep_en - ep_st))
print('all time:', (time.time() - start))


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True,
help="model to be tested")
parser.add_argument("--all", action="store_true",
help="whether to use all data")
args = parser.parse_args()
raw_model = args.model
import tf_models
model = eval('tf_models.' + raw_model)
dataset = raw_model.split('_')[-1]
print('Model:', raw_model)

if dataset == 'criteo':
train_criteo(model, args)
else:
raise NotImplementedError


if __name__ == '__main__':
main()

+ 10
- 0
examples/ctr/settings/local_s1.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 0
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w2.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 2
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 2
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w4.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 4
server : 1
scheduler : true

+ 10
- 0
examples/ctr/settings/local_s1_w8.yml View File

@@ -0,0 +1,10 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 8
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3
launch :
worker : 8
server : 1
scheduler : true

+ 6
- 0
examples/ctr/settings/local_w4.yml View File

@@ -0,0 +1,6 @@
shared :
DMLC_PS_ROOT_URI : 127.0.0.1
DMLC_PS_ROOT_PORT : 13100
DMLC_NUM_WORKER : 4
DMLC_NUM_SERVER : 1
DMLC_PS_VAN_TYPE : p3

+ 4
- 0
examples/ctr/settings/plx_local_spec.yml View File

@@ -0,0 +1,4 @@
nodes:
- address: localhost
cpus: [0]
gpus: [0,1,2,3,4,5,6,7]

+ 9
- 0
examples/ctr/settings/tf_local_s1_w2.json View File

@@ -0,0 +1,9 @@
{
"worker": [
"127.0.0.1:12349",
"127.0.0.1:12348"
],
"ps": [
"127.0.0.1:12345"
]
}

+ 11
- 0
examples/ctr/settings/tf_local_s1_w4.json View File

@@ -0,0 +1,11 @@
{
"worker": [
"127.0.0.1:23459",
"127.0.0.1:23458",
"127.0.0.1:23457",
"127.0.0.1:23456"
],
"ps": [
"127.0.0.1:23455"
]
}

+ 15
- 0
examples/ctr/settings/tf_local_s1_w8.json View File

@@ -0,0 +1,15 @@
{
"worker": [
"127.0.0.1:34569",
"127.0.0.1:34568",
"127.0.0.1:34567",
"127.0.0.1:34566",
"127.0.0.1:34565",
"127.0.0.1:34564",
"127.0.0.1:34563",
"127.0.0.1:34562"
],
"ps": [
"127.0.0.1:34575"
]
}

+ 7
- 0
examples/ctr/tests/hybrid_dcn_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_dfm_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_wdl_adult.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 7
- 0
examples/ctr/tests/hybrid_wdl_criteo.sh View File

@@ -0,0 +1,7 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched &
mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml

+ 6
- 0
examples/ctr/tests/local_dcn_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dcn_criteo --val

+ 6
- 0
examples/ctr/tests/local_dfm_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dfm_criteo --val

+ 6
- 0
examples/ctr/tests/local_wdl_adult.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_adult --val

+ 6
- 0
examples/ctr/tests/local_wdl_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_criteo --val

+ 6
- 0
examples/ctr/tests/ps_dcn_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_dfm_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_wdl_adult.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

+ 6
- 0
examples/ctr/tests/ps_wdl_criteo.sh View File

@@ -0,0 +1,6 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../run_hetu.py

python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml

Some files were not shown because too many files changed in this diff
