diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1fccce5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "third_party/GraphMix"] + path = third_party/GraphMix + url = https://github.com/nox-410/GraphMix.git +[submodule "third_party/HetuML"] + path = third_party/HetuML + url = https://github.com/ccchengff/HetuML.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..fa225f1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,59 @@ +cmake_minimum_required(VERSION 3.18) + +project(Hetu CXX) + +include(cmake/config.cmake) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_CXX_FLAGS "-O3 -Wall") + +# openmp +find_package(OpenMP REQUIRED) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + +# compile flag +if(${HETU_VERSION} STREQUAL "all") + set(HETU_COMPILE_GPU ON) + set(HETU_COMPILE_MKL ON) +elseif(${HETU_VERSION} STREQUAL "gpu") + set(HETU_COMPILE_GPU ON) + set(HETU_COMPILE_MKL OFF) +elseif(${HETU_VERSION} STREQUAL "mkl") + set(HETU_COMPILE_GPU OFF) + set(HETU_COMPILE_MKL ON) +else() + message(FATAL_ERROR "unknown hetu version") +endif() +message(STATUS "HETU version: ${HETU_VERSION}") + +# cuda +if(${HETU_COMPILE_GPU}) + set(CMAKE_CUDA_COMPILER ${CUDAToolkit_ROOT}/bin/nvcc) + file(READ ${CUDAToolkit_ROOT}/version.txt RAW_CUDA_VERSION) + string(REGEX MATCH "[0-9\.]+" CUDA_VERSION ${RAW_CUDA_VERSION}) + if(${CUDA_VERSION} VERSION_LESS "10.1") + message(FATAL_ERROR "Required CUDA version >= 10.1, while current CUDA version is ${CUDA_VERSION}") + endif() + find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) +endif() + +include(FetchContent) # download third_party + +add_subdirectory(${CMAKE_SOURCE_DIR}/src) + +if(${HETU_PS}) + add_subdirectory(${CMAKE_SOURCE_DIR}/ps-lite) +endif() + +if(${HETU_GEOMETRIC}) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/GraphMix) +endif() +if (HETU_ML) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/HetuML) +endif() +enable_testing() diff --git a/COMMITTERS.md b/COMMITTERS.md new file mode 100644 index 0000000..3406485 --- /dev/null +++ b/COMMITTERS.md @@ -0,0 +1,33 @@ +## Committer + +Any existing Committer can nominate an individual making significant and valuable contributions across the Hetu Project to become a new Committer. + +One may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers. + +Committers should be familiar with the guidelines for new contributors in [CONTRIBUTING.md](CONTRIBUTING.md). 
+
+## Committer Members
+### Current Committers
+- [Hsword](https://github.com/Hsword) - **Xupeng Miao** <[swordonline@foxmail.com](swordonline@foxmail.com)>
+- [ccchengff](https://github.com/ccchengff) - **Fangcheng Fu** <[ccchengff@gmail.com](ccchengff@gmail.com)>
+- [codecaution](https://github.com/codecaution) - **Xiaonan Nie**
+- [HugoZHL](https://github.com/HugoZHL) - **Hailin Zhang**
+- [nox-410](https://github.com/nox-410) - **Yining Shi**
+- [initzhang](https://github.com/initzhang) - **Xin Zhang**
+- [lovelyhan](https://github.com/lovelyhan) - **Yuezihan Jiang**
+- [AFDWang](https://github.com/AFDWang) - **Yujie Wang**
+- [sj1104](https://github.com/sj1104) - **Jia Shen**
+- [zhouyuegit](https://github.com/zhouyuegit) - **Yue Zhou**
+- [zmxdream](https://github.com/zmxdream) - **Minxu Zhang**
+
+We would like to sincerely thank the following community members for their contributions to Hetu.
+
+- [leleyu](https://github.com/leleyu) - **Lele Yu (Bytedance)**
+- [lbluesjjw](https://github.com/bluesjjw) - **Jiawei Jiang (ETH)**
+- [ghandzhipeng](https://github.com/ghandzhipeng) - **Zhipeng Zhang (Alibaba)**
+- [xysmlx](https://github.com/xysmlx) - **Lingxiao Ma (MSRA)**
+- [hbsun2113](https://github.com/hbsun2113) - **Haobo Sun (Microsoft STCA)**
+- [M-Arimase](https://github.com/M-Arimase) - **Yikai Zhao**
+- [tsingyawn](https://github.com/tsingyawn) - **Xinlei Xue**
+- **Lizi Su**
+- **Dong Li**
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..239c267
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,53 @@
+# Contributing to Hetu
+You are welcome to [report issues](https://github.com/PKU-DAIR/Hetu/issues) or open [pull requests](https://github.com/PKU-DAIR/Hetu/pulls). It is recommended to read the following contributing guide before contributing.
+
+
+## Issues
+We use GitHub Issues to track public bugs and feature requests.
+
+### Search Known Issues First
+Please search the existing issues to see if a similar issue or feature request has already been filed, and make sure your issue is not a duplicate.
+
+### Reporting New Issues
+When opening an issue, the more information the better: a detailed description, screenshots or videos of the problem, and logs or code blocks that reproduce the crash.
+
+## Pull Requests
+We warmly welcome pull requests that make Hetu better.
+
+### Branch Management
+There are two kinds of branches here:
+
+1. `main` branch.
+
+   (1). It is the latest (pre-)release branch. We use `main` for tags, with version numbers `1.0.0`, `1.1.0`, `1.2.0`...
+
+   (2). **Don't submit any PR on the `main` branch.**
+
+2. `specific version` branches.
+
+   (1). There is a `specific version` branch for each Hetu version, such as `branch-1.0.0`, `branch-1.1.0`. It is our stable development branch. After full testing, a `specific version` branch will be merged into `main` for the next release.
+
+   (2). **You are recommended to submit bugfix or feature PRs on a `specific version` branch.**
+
+
+Normal bugfixes or feature requests should be submitted to a `specific version` branch. After full testing, we will merge them into `main` for the next release.
+
+
+### Make Pull Requests
+The code team monitors all pull requests and runs code checks and tests on them. After all tests pass, we will accept the PR, but it will not be merged into `main` immediately; there may be some delay.
+
+Before submitting a pull request, please make sure the following are done:
+
+1. Fork the repo and create your branch from `main` or a `specific version` branch.
+2. 
Update code or documentation if you have changed APIs. +3. Add the copyright notice to the top of any new files you've added. +4. Check your code lints and checkstyles. +5. Test and test again your code. +6. Now, you can submit your pull request on `specific version` branch. + +## Code Style Guide +Use [Code Style](./.clang-format) for Python and C++. + +## License +By contributing to Hetu, you agree that your contributions will be licensed +under [License](LICENSE) \ No newline at end of file diff --git a/LICENSE b/LICENSE index d63dfb7..7849ac5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,124 +1,201 @@ -木兰宽松许可证, 第2版 - -2020年1月 http://license.coscl.org.cn/MulanPSL2 - -您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: - -0. 定义 - -“软件” 是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 - -“贡献” 是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 - -“贡献者” 是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 - -“法人实体” 是指提交贡献的机构及其“关联实体”。 - -“关联实体” 是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 - -1. 授予版权许可 - -每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 - -2. 授予专利许可 - -每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 - -3. 无商标许可 - -“本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 - -4. 分发限制 - -您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 - -5. 免责声明与责任限制 - -“软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 - -6. 语言 - -“本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 - -条款结束 - -如何将木兰宽松许可证,第2版,应用到您的软件 - -如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: - -1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; - -2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; - -3, 请将如下声明文本放入每个源文件的头部注释中。 - -Copyright (c) [Year] [name of copyright holder] -[Software Name] is licensed under Mulan PSL v2. -You can use this software according to the terms and conditions of the Mulan PSL v2. -You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -See the Mulan PSL v2 for more details. -Mulan Permissive Software License,Version 2 -Mulan Permissive Software License,Version 2 (Mulan PSL v2) - -January 2020 http://license.coscl.org.cn/MulanPSL2 - -Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: - -0. Definition - -Software means the program and related documents which are licensed under this License and comprise all Contribution(s). - -Contribution means the copyrightable work licensed by a particular Contributor under this License. - -Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. - -Legal Entity means the entity making a Contribution and all its Affiliates. - -Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. 
- -1. Grant of Copyright License - -Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. - -2. Grant of Patent License - -Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. - -3. No Trademark License - -No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in section 4. - -4. Distribution Restriction - -You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. - -5. Disclaimer of Warranty and Limitation of Liability - -THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -6. Language - -THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. - -END OF THE TERMS AND CONDITIONS - -How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software - -To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: - -Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; -Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package; -Attach the statement to the appropriate annotated syntax at the beginning of each source file. 
-Copyright (c) [Year] [name of copyright holder] -[Software Name] is licensed under Mulan PSL v2. -You can use this software according to the terms and conditions of the Mulan PSL v2. -You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -See the Mulan PSL v2 for more details. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] [Peking University] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 6ec32e1..1accd40 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,177 @@ -#### 从命令行创建一个新的仓库 +
+ +
+# HETU
+
+
+
+[Documentation](https://hetu-doc.readthedocs.io) |
+[Examples](https://hetu-doc.readthedocs.io/en/latest/Overview/performance.html)
+
+Hetu is a high-performance distributed deep learning system targeting the training of trillion-parameter DL models, developed by the DAIR Lab at Peking University. It takes into account both high availability in industry and innovation in academia, and has a number of advanced characteristics:
+
+- Applicability. DL model definition with a standard dataflow graph; many basic CPU and GPU operators; efficient implementations of plenty of DL models and at least 10 popular ML algorithms.
+
+- Efficiency. Achieves at least a 30% speedup over TensorFlow on DNN, CNN and RNN benchmarks.
+
+- Flexibility. Supports various parallel training protocols and distributed communication architectures, such as data/model/pipeline parallelism and Parameter Server & AllReduce.
+
+- Scalability. Deployment on more than 100 computation nodes; training giant models with trillions of model parameters, e.g., on Criteo Kaggle and Open Graph Benchmark.
+
+- Agility. Automated ML pipeline: feature engineering, model selection, hyperparameter search.
+
+We welcome everyone interested in machine learning or graph computing to contribute code, create issues or open pull requests. Please refer to the [Contribution Guide](CONTRIBUTING.md) for more details.
+
+## Installation
+1. Clone the repository.
+
+2. Prepare the environment. We use Anaconda to manage packages. The following command creates the conda environment to be used: ```conda env create -f environment.yml```. Please prepare the CUDA toolkit and cuDNN in advance.
+
+3. We use CMake to compile Hetu. Please copy the example configuration for compilation by `cp cmake/config.example.cmake cmake/config.cmake`. Users can modify the configuration file to enable/disable the compilation of each module. For advanced users (who are not using the provided conda environment), the prerequisites for the different modules in Hetu are listed in the appendix.
 ```bash
-touch README.md
-git init
-git add README.md
-git commit -m "first commit"
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+# modify paths and configurations in cmake/config.cmake
+
+# generate Makefile
+mkdir build && cd build && cmake ..
+# compile
+# make all
+make -j 8
+# make hetu, version is specified in cmake/config.cmake
+make hetu -j 8
+# make allreduce module
+make allreduce -j 8
+# make ps module
+make ps -j 8
+# make geometric module
+make geometric -j 8
+# make hetu-cache module
+make hetu_cache -j 8
 ```
-#### 从命令行推送已经创建的仓库
+
+4. Prepare the environment for running. Edit the hetu.exp file and set the environment path for Python and the path to the mpirun executable if necessary (for advanced users not using the provided conda environment). Then execute the command `source hetu.exp`.
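+
+   As a rough illustration, `hetu.exp` is a shell snippet that exports the required paths before it is sourced. A minimal sketch is shown below; the variable values are placeholders for illustration, not the exact file shipped with the repository:
+
+   ```bash
+   # hypothetical sketch of hetu.exp: expose the Hetu Python package and,
+   # if mpirun lives outside the conda environment, its bin directory
+   export PYTHONPATH=/path/to/Hetu/python:$PYTHONPATH
+   export PATH=/path/to/openmpi/bin:$PATH
+   ```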
+
+
+
+## Usage
+
+Train logistic regression on GPU:
 ```bash
-git remote add origin https://git.trustie.net/PKU-DAIR/Hetu.git
-git push -u origin master
+bash examples/cnn/scripts/hetu_1gpu.sh logreg MNIST
 ```
+
+Train a 3-layer MLP on GPU:
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer CNN on GPU:
+
+```bash
+bash examples/cnn/scripts/hetu_1gpu.sh cnn_3_layers MNIST
+```
+
+Train a 3-layer MLP with AllReduce on 8 GPUs (using mpirun):
+```bash
+bash examples/cnn/scripts/hetu_8gpu.sh mlp CIFAR10
+```
+
+Train a 3-layer MLP with PS on 1 server and 2 workers:
+```bash
+# in the script we launch the scheduler, the server, and two workers
+bash examples/cnn/scripts/hetu_2gpu_ps.sh mlp CIFAR10
+```
+
+
+## More Examples
+Please refer to the examples directory, which contains CNN, NLP, CTR and GNN training scripts. For distributed training, please refer to the CTR and GNN tasks.
+
+## Community
+* Email: xupeng.miao@pku.edu.cn
+* Slack: coming soon
+* Hetu homepage: https://hetu-doc.readthedocs.io
+* [Committers & Contributors](COMMITTERS.md)
+* [Contributing to Hetu](CONTRIBUTING.md)
+* [Development plan](https://hetu-doc.readthedocs.io/en/latest/plan.html)
+
+## Enterprise Users
+
+If you are an enterprise user and find Hetu useful in your work, please let us know, and we will be glad to add your company logo here.
+
+
+
+
+
+## License
+
+The entire codebase is under the [license](LICENSE).
+
+## Papers
+ 1. Xupeng Miao, Lingxiao Ma, Zhi Yang, Yingxia Shao, Bin Cui, Lele Yu, Jiawei Jiang. [CuWide: Towards Efficient Flow-based Training for Sparse Wide Models on GPUs](https://ieeexplore.ieee.org/document/9261124). TKDE 2021, ICDE 2021.
+ 2. Xupeng Miao, Xiaonan Nie, Yingxia Shao, Zhi Yang, Jiawei Jiang, Lingxiao Ma, Bin Cui. [Heterogeneity-Aware Distributed Machine Learning Training via Partial Reduce](https://doi.org/10.1145/3448016.3452773). SIGMOD 2021.
+ 3. coming soon
+
+## Acknowledgements
+
+We learned and borrowed insights from a few open-source projects, including [TinyFlow](https://github.com/tqchen/tinyflow), [autodist](https://github.com/petuum/autodist), [tf.distribute](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/distribute) and [Angel](https://github.com/Angel-ML/angel).
+
+## Appendix
+The prerequisites for the different modules in Hetu are listed as follows:
+ ```
+ "*" means you should prepare it yourself, while the others support auto-download
+
+ Hetu: OpenMP(*), CMake(*)
+ Hetu (version mkl): MKL 1.6.1
+ Hetu (version gpu): CUDA 10.1(*), CUDNN 7.5(*)
+ Hetu (version all): both
+
+ Hetu-AllReduce: MPI 3.1, NCCL 2.8(*), this module needs the GPU version
+
+ Hetu-PS: Protobuf(*), ZeroMQ 4.3.2
+
+ Hetu-Geometric: Pybind11(*), Metis(*)
+
+ Hetu-Cache: Pybind11(*), this module needs the PS module
+
+ ##################################################################
+ Tips for preparing the prerequisites
+
+ Preparing CUDA, CUDNN, NCCL (NCCL is already in the conda environment):
+ 1. download from https://developer.nvidia.com
+ 2. install
+ 3. modify paths in cmake/config.cmake if necessary
+
+ Preparing OpenMP:
+ You just need to ensure that your compiler supports OpenMP.
+
+ Preparing CMake, Protobuf, Pybind11, Metis:
+ Install by anaconda:
+ conda install cmake=3.18 libprotobuf pybind11=2.6.0 metis
+
+ Preparing OpenMPI (not necessary):
+ install by anaconda: `conda install -c conda-forge openmpi=4.0.3`
+ or
+ 1. download from https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
+ 2. 
build openmpi by `./configure /path/to/build && make -j8 && make install` + 3. modify MPI_HOME to /path/to/build in cmake/config.cmake + + Preparing MKL (not necessary): + install by anaconda: `conda install -c conda-forge onednn` + or + 1. download from https://github.com/intel/mkl-dnn/archive/v1.6.1.tar.gz + 2. build mkl by `mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8` + 3. modify MKL_ROOT to /path/to/root and MKL_BUILD to /path/to/build in cmake/config.cmake + + Preparing ZeroMQ (not necessary): + install by anaconda: `conda install -c anaconda zeromq=4.3.2` + or + 1. download from https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip + 2. build zeromq by 'mkdir /path/to/build && cd /path/to/build && cmake /path/to/root && make -j8` + 3. modify ZMQ_ROOT to /path/to/build in cmake/config.cmake + ``` diff --git a/bin/heturun b/bin/heturun new file mode 100755 index 0000000..5bf3693 --- /dev/null +++ b/bin/heturun @@ -0,0 +1,2 @@ +#!/bin/bash +python $(cd $(dirname $0); pwd)/../python/runner.py $@ diff --git a/cmake/Modules/FindCUDNN.cmake b/cmake/Modules/FindCUDNN.cmake new file mode 100644 index 0000000..5cfbb0d --- /dev/null +++ b/cmake/Modules/FindCUDNN.cmake @@ -0,0 +1,75 @@ +# Find the CUDNN libraries +# +# The following variables are optionally searched for defaults +# CUDNN_ROOT: Base directory where CUDNN is found +# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for +# CUDNN_LIBRARY: Directory where CUDNN library is searched for +# CUDNN_STATIC: Are we looking for a static library? (default: no) +# +# The following are set after configuration is done: +# CUDNN_FOUND +# CUDNN_INCLUDE_PATH +# CUDNN_LIBRARY_PATH +# + +set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN") +if (DEFINED $ENV{CUDNN_ROOT_DIR}) + message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.") +endif() +list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) + +# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT}) + +set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files") + +find_path(CUDNN_INCLUDE_PATH cudnn.h + HINTS ${CUDNN_INCLUDE_DIR} + PATH_SUFFIXES cuda/include cuda include + REQUIRED) + +option(CUDNN_STATIC "Look for static CUDNN" OFF) +if (CUDNN_STATIC) + set(CUDNN_LIBNAME "libcudnn_static.a") +else() + set(CUDNN_LIBNAME "cudnn") +endif() + +set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)") +if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC) + message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.") +endif() + +find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME} + PATHS ${CUDNN_LIBRARY} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64 + REQUIRED) + +set(file "${PROJECT_BINARY_DIR}/detect_cudnn_version.cc") +file(WRITE ${file} " + #include + #include \"${CUDNN_INCLUDE_PATH}/cudnn.h\" + int main() + { + std::cout << CUDNN_MAJOR << '.' << CUDNN_MINOR << '.' 
<< CUDNN_PATCHLEVEL; + int x = cudnnGetVersion(); + return x == CUDNN_VERSION; + } +") +try_run(CUDNN_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE CUDNN_VERSION + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" + LINK_LIBRARIES ${CUDNN_LIBRARY_PATH}) +if (NOT CUDNN_VERSION_MATCHED) + message(FATAL_ERROR "Found CUDNN header version and library version do not match! \ + (include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH}). Please set CUDNN_ROOT manually.") +endif() +message(STATUS "CUDNN version: ${CUDNN_VERSION}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + CUDNN + REQUIRED_VARS CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH + VERSION_VAR CUDNN_VERSION) + +mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY) diff --git a/cmake/Modules/FindMETIS.cmake b/cmake/Modules/FindMETIS.cmake new file mode 100644 index 0000000..eaf818b --- /dev/null +++ b/cmake/Modules/FindMETIS.cmake @@ -0,0 +1,70 @@ +# Accepts the following variables: +# +# METIS_ROOT: Prefix where METIS is installed. +# METIS_LIB_NAME: Name of the METIS library (default: metis). +# METIS_LIBRARY: Full path of the METIS library. + +# Sets the following variables: +# +# METIS_LIBRARY: Full path of the METIS library. +# METIS_FOUND: True if ParMETIS was found. +# METIS_LIBRARIES: List of all libraries needed for linking with METIS, +# +# Provides the following macros: +# +# find_package(METIS) +# +# Searches for METIS (See above) + + +# search metis header +find_path(METIS_INCLUDE_DIR metis.h + PATHS ${METIS_DIR} ${METIS_ROOT} + PATH_SUFFIXES metis include include/metis Lib METISLib + NO_DEFAULT_PATH + DOC "Include directory of metis") +find_path(METIS_INCLUDE_DIR metis.h + PATH_SUFFIXES metis include include/metis Lib METISLib) + +set(METIS_LIBRARY METIS_LIBRARY-NOTFOUND CACHE FILEPATH "Full path of the METIS library") + +# search metis library +if(NOT METIS_LIB_NAME) + set(METIS_LIB_NAME metis) +endif(NOT METIS_LIB_NAME) + +find_library(METIS_LIBRARY ${METIS_LIB_NAME} + PATHS ${METIS_DIR} ${METIS_ROOT} + PATH_SUFFIXES lib + NO_DEFAULT_PATH) +find_library(METIS_LIBRARY ${METIS_LIB_NAME} + PATH_SUFFIXES lib +) + +# behave like a CMake module is supposed to behave +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + "METIS" + DEFAULT_MSG + METIS_INCLUDE_DIR + METIS_LIBRARY +) + +mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARIES METIS_LIB_NAME) + +# if both headers and library are found, store results +if(METIS_FOUND) + set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR}) + set(METIS_LIBRARIES ${METIS_LIBRARY}) + # log result + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log + "Determing location of METIS succeded:\n" + "Include directory: ${METIS_INCLUDE_DIRS}\n" + "Library directory: ${METIS_LIBRARIES}\n\n") +else(METIS_FOUND) + # log errornous result + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + "Determing location of METIS failed:\n" + "Include directory: ${METIS_INCLUDE_DIRS}\n" + "Library directory: ${METIS_LIBRARIES}\n\n") +endif(METIS_FOUND) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000..322a80a --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,14 @@ +# - Try to find DNNL(MKL-DNN) +# Once done this will define +# DNNL_FOUND - System has DNNL +# DNNL_INCLUDE_DIR - The DNNL include directories +# DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build +# DNNL_LIBRARY - The libraries needed to 
use DNNL +# DNNL_DEFINITIONS - Compiler switches required for using DNNL + +find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include ) +find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include ) +find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src ) + +include ( FindPackageHandleStandardArgs ) +find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR ) diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 0000000..b0a81d9 --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,97 @@ +# Try to find NCCL +# +# The following variables are optionally searched for defaults +# NCCL_ROOT: Base directory where all NCCL components are found +# NCCL_ROOT_DIR: Base directory where all NCCL components are found +# NCCL_INCLUDE_DIR: Directory where NCCL header is found +# NCCL_LIB_DIR: Directory where NCCL library is found +# +# The following are set after configuration is done: +# NCCL_FOUND +# NCCL_INCLUDE_DIRS +# NCCL_LIBRARIES +# +# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# install NCCL in the same location as the CUDA toolkit. +# See https://github.com/caffe2/caffe2/issues/1601 + +if (NOT DEFINED NCCL_ROOT) + set(NCCL_ROOT $ENV{CONDA_PREFIX}) +endif() + +set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL") + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS + ${NCCL_ROOT} + ${NCCL_ROOT}/include + ${NCCL_INCLUDE_DIR} + ${NCCL_ROOT_DIR} + ${NCCL_ROOT_DIR}/include + ${CUDA_TOOLKIT_ROOT_DIR}/include + REQUIRED) + +if ($ENV{USE_STATIC_NCCL}) + message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library") + set(NCCL_LIBNAME "libnccl_static.a") +else() + set(NCCL_LIBNAME "nccl") +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS + ${NCCL_LIB_DIR} + ${NCCL_ROOT} + ${NCCL_ROOT}/lib + ${NCCL_ROOT}/lib/x86_64-linux-gnu + ${NCCL_ROOT}/lib64 + ${NCCL_ROOT_DIR} + ${NCCL_ROOT_DIR}/lib + ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu + ${NCCL_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + REQUIRED) + +set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") +message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") +set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) +list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIR}) +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + +if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include \"${NCCL_HEADER_FILE}\" + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } + ") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDAToolkit_INCLUDE_DIR}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! \ + (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}). 
Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION}") +else() + message(STATUS "NCCL version < 2.3.5-5") +endif () +set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + +mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + NCCL + REQUIRED_VARS NCCL_INCLUDE_DIRS NCCL_LIBRARIES + VERSION_VAR NCCL_VERSION) diff --git a/cmake/Modules/FindZMQ.cmake b/cmake/Modules/FindZMQ.cmake new file mode 100644 index 0000000..806c9c6 --- /dev/null +++ b/cmake/Modules/FindZMQ.cmake @@ -0,0 +1,47 @@ +# - Try to find ZMQ +# Once done this will define +# ZMQ_FOUND - System has ZMQ +# ZMQ_INCLUDE_DIRS - The ZMQ include directories +# ZMQ_LIBRARIES - The libraries needed to use ZMQ +# ZMQ_DEFINITIONS - Compiler switches required for using ZMQ + +find_path ( ZMQ_INCLUDE_DIR zmq.h HINTS ${ZMQ_ROOT}/include ) +find_library ( ZMQ_LIBRARY NAMES zmq HINTS ${ZMQ_BUILD}/lib ) + +set ( ZMQ_LIBRARIES ${ZMQ_LIBRARY} ) +set ( ZMQ_INCLUDE_DIRS ${ZMQ_INCLUDE_DIR} ) + +if (DEFINED ZMQ_LIBRARIES AND DEFINED ZMQ_INCLUDE_DIRS) + set(file "${PROJECT_BINARY_DIR}/detect_zeromq_version.cc") + file(WRITE ${file} " + #include + #include \"${ZMQ_INCLUDE_DIRS}/zmq.h\" + int main() + { + std::cout << ZMQ_VERSION_MAJOR << '.' << ZMQ_VERSION_MINOR << '.' << ZMQ_VERSION_PATCH; + int x, y, z; + zmq_version(&x, &y, &z); + return x == ZMQ_VERSION_MAJOR && y == ZMQ_VERSION_MINOR && z == ZMQ_VERSION_PATCH; + } + ") + try_run(ZMQ_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE ZMQ_VERSION + LINK_LIBRARIES ${ZMQ_LIBRARIES}) + if (NOT ZMQ_VERSION_MATCHED) + message(WARNING "Found ZMQ header version and library version do not match! \ + (include: ${ZMQ_INCLUDE_DIRS}, library: ${ZMQ_LIBRARIES}). 
Please set ZMQ_ROOT and ZMQ_BUILD carefully.") + unset(ZMQ_INCLUDE_DIRS) + unset(ZMQ_LIBRARIES) + unset(ZMQ_VERSION) + else () + message(STATUS "ZMQ version: ${ZMQ_VERSION}") + endif() +endif() + +include ( FindPackageHandleStandardArgs ) +# handle the QUIETLY and REQUIRED arguments and set ZMQ_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args ( + ZMQ + REQUIRED_VARS ZMQ_LIBRARIES ZMQ_INCLUDE_DIRS + VERSION_VAR ZMQ_VERSION) diff --git a/cmake/config.example.cmake b/cmake/config.example.cmake new file mode 100644 index 0000000..86224f1 --- /dev/null +++ b/cmake/config.example.cmake @@ -0,0 +1,55 @@ +###################### +### Set targets ###### +###################### + +# hetu main version, choose from (mkl, gpu, all) +# if using mkl (for CPU) or all, OpenMP(*), mkl required +# if using gpu or all, OpenMP(*), CUDA(*), CUDNN(*) required +set(HETU_VERSION "all") + +# whether to compile allreduce module +# nccl(*), openmpi required +set(HETU_ALLREDUCE ON) + +# whether to compile ps module +# protobuf(*), zeromq required +set(HETU_PS ON) + +# whether to compile geometric module (for GNNs) +# pybind11(*), metis(*) required +set(HETU_GEOMETRIC ON) + +# whether to compile cache module (for PS) +# to enable this, you must turn HETU_PS on +# pybind11(*) required +set(HETU_CACHE ON) + +# whether to compile Hetu ML Module +set(HETU_ML ON) +set(HETU_PARALLEL_ML ON) + +###################### +### Set paths ######## +###################### + +# CUDA version >= 10.1 +set(CUDAToolkit_ROOT /usr/local/cuda) + +# NCCL version >= 2.8 +set(NCCL_ROOT $ENV{CONDA_PREFIX}) + +set(CUDNN_ROOT) + +# MPI version >= 3.1 (OpenMPI version >= 4.0.3) +# if valid version not found, we'll download and compile it in time (openmpi-4.0.3) +set(MPI_HOME $ENV{CONDA_PREFIX}) + +# MKL 1.6.1, MKL_ROOT: root directory of mkl, MKL_BUILD: build directory of mkl +# if not found, we'll download and compile it in time +set(MKL_ROOT $ENV{CONDA_PREFIX}) +set(MKL_BUILD $ENV{CONDA_PREFIX}) + +# ZMQ 4.3.2, ZMQ_ROOT: root directory of zeromq, ZMQ_BUILD: build directory of zeromq +# if not found, we'll download and compile it in time +set(ZMQ_ROOT $ENV{CONDA_PREFIX}) +set(ZMQ_BUILD $ENV{CONDA_PREFIX}) diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a230326 --- /dev/null +++ b/environment.yml @@ -0,0 +1,84 @@ +name: hetu +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=4.5=1_gnu + - bcrypt=3.2.0=py37h5e8e339_1 + - blas=1.0=mkl + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2021.7.5=h06a4308_1 + - certifi=2021.5.30=py37h06a4308_0 + - cffi=1.14.6=py37hc58025e_0 + - cmake=3.18.2=ha30ef3c_0 + - cryptography=3.4.7=py37h5d9358c_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - expat=2.4.1=h2531618_2 + - intel-openmp=2021.3.0=h06a4308_3350 + - joblib=1.0.1=pyhd3eb1b0_0 + - krb5=1.18.2=h173b8e3_0 + - ld_impl_linux-64=2.35.1=h7274673_9 + - libcurl=7.71.1=h20c2e04_1 + - libedit=3.1.20210216=h27cfd23_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.3.0=h5101ec6_17 + - libgfortran-ng=7.5.0=h14aa051_19 + - libgfortran4=7.5.0=h14aa051_19 + - libgomp=9.3.0=h5101ec6_17 + - libprotobuf=3.15.8=h780b84a_0 + - libsodium=1.0.18=h7b6447c_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.3.0=hd4cf53a_17 + - libuv=1.40.0=h7b6447c_0 + - lz4-c=1.9.3=h2531618_0 + - metis=5.1.0=hf484d3e_4 + - mkl=2021.3.0=h06a4308_520 + - mkl-service=2.4.0=py37h7f8727e_0 + - mkl_fft=1.3.0=py37h42c9631_2 + - mkl_random=1.2.2=py37h51133e4_0 + - mpi=1.0=openmpi + - 
nccl=2.8.3.1=hcaf9a05_0 + - ncurses=6.2=he6710b0_1 + - numpy=1.20.3=py37hf144106_0 + - numpy-base=1.20.3=py37h74d4b33_0 + - onednn=2.3=omp_hf4ef041_0 + - onnx=1.9.0=py37h284874a_0 + - onnxruntime=1.7.2=py37he8cb6d3_1 + - openmpi=4.0.3=hdf1f1ad_1 + - openssl=1.1.1k=h27cfd23_0 + - pandas=1.2.5=py37h295c915_0 + - paramiko=2.7.2=pyh9f0ad1d_0 + - pip=21.1.3=py37h06a4308_0 + - protobuf=3.15.8=py37hcd2ae1e_0 + - psutil=5.8.0=py37h5e8e339_1 + - pybind11=2.6.2=py37hff7bd54_1 + - pycparser=2.20=pyh9f0ad1d_2 + - pynacl=1.4.0=py37h5e8e339_2 + - python=3.7.10=h12debd9_4 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.7=2_cp37m + - pytz=2021.1=pyhd3eb1b0_0 + - pyyaml=5.4.1=py37h27cfd23_1 + - re2=2021.04.01=h9c3ff4c_0 + - readline=8.1=h27cfd23_0 + - rhash=1.4.1=h3c74f83_1 + - scikit-learn=0.24.2=py37ha9443f7_0 + - scipy=1.6.2=py37had2a1c9_1 + - setuptools=52.0.0=py37h06a4308_0 + - six=1.16.0=pyhd3eb1b0_0 + - sqlite=3.36.0=hc218d9a_0 + - threadpoolctl=2.2.0=pyhb85f177_0 + - tk=8.6.10=hbc83047_0 + - tqdm=4.61.2=pyhd3eb1b0_1 + - typing-extensions=3.10.0.0=hd8ed1ab_0 + - typing_extensions=3.10.0.0=pyha770c72_0 + - wheel=0.36.2=pyhd3eb1b0_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zeromq=4.3.2=he6710b0_3 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.9=haebb681_0 + - pip: + - cloudpickle==1.6.0 + - wget==3.2 diff --git a/examples/cnn/README.md b/examples/cnn/README.md new file mode 100644 index 0000000..474b927 --- /dev/null +++ b/examples/cnn/README.md @@ -0,0 +1,49 @@ +# CNN Examples +In this directory we provide simple implementations for CNN models, including both hetu and tensorflow versions for comparison. +## Structure +``` +- cnn + - models/ CNN models in HETU + - pytorch_models/ CNN models in PyTorch + - tf_models/ CNN models in TensorFlow + - scripts/ Test scripts + - main.py Trainer for HETU + - run_tf_horovod.py Trainer for Horovod + - tf_launch_server.py Trainer for TF-PS (role: server) + - tf_launch_worker.py Trainer for TF-PS (role: worker) + - tf_main.py Trainer for TensorFlow + - torch_main.py Trainer for Pytorch + - +``` +## Usage +Here are some examples of running scripts. +```bash +bash scripts/hetu_1gpu.sh mlp CIFAR10 # mlp with CIFAR10 dataset in hetu +bash scripts/hetu_8gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (1-node) +bash scripts/hetu_16gpu.sh mlp CIFAR10 # mlp with CIFAR10 in hetu with 8-GPU (2-nodes) +``` +To train in PS setting, we also need to launch scheduler and server first. For more information about distributed training, please refer to CTR or GNN examples. + +We can change the setting in scripts. See `mnist_mlp.sh` below. +```bash +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + +### validate and timing +python ${mainpy} --model mlp --dataset CIFAR10 --validate --timing + +### run in cpu +# python ${mainpy} --model mlp --dataset CIFAR10 --gpu -1 --validate --timing + +``` + +For more details about training setting, please refer to `main.py`. +## Models +We provide following models with specific datasets. 
+``` +CIFAR100: VGG, ResNet +CIFAR10: MLP, VGG, ResNet +MNIST: AlexNet, CNN(3-layer), LeNet, LogisticRegression, LSTM, RNN +``` diff --git a/examples/cnn/local_s1.yml b/examples/cnn/local_s1.yml new file mode 100644 index 0000000..20ffc94 --- /dev/null +++ b/examples/cnn/local_s1.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13030 + DMLC_NUM_WORKER : 2 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 0 + server : 1 + scheduler : true diff --git a/examples/cnn/main.py b/examples/cnn/main.py new file mode 100644 index 0000000..1a4a224 --- /dev/null +++ b/examples/cnn/main.py @@ -0,0 +1,202 @@ +import hetu as ht +import models +import os +import numpy as np +import argparse +import json +import logging +from time import time +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if device_id == 0: + logger.info(msg) + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=10, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument('--comm-mode', default=None, help='communication mode') + args = parser.parse_args() + + global device_id + device_id = 0 + print_rank0("Training {} on HETU".format(args.model)) + if args.comm_mode in ('AllReduce', 'Hybrid'): + comm, device_id = ht.mpi_nccl_init() + executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) + else: + if args.gpu == -1: + executor_ctx = ht.cpu(0) + print_rank0('Use CPU.') + else: + executor_ctx = ht.gpu(args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + if args.comm_mode in ('PS', 'Hybrid'): + settings_file = open(os.path.join(os.path.abspath( + os.path.dirname(__file__)), 'worker_conf%d.json' % args.gpu)) + settings = json.load(settings_file) + for key in settings: + if type(settings[key]) == str: + os.environ[key] = settings[key] + else: + os.environ[key] = str(settings[key]) # type is str + + assert args.model in ['alexnet', 'cnn_3_layers', 'lenet', 'logreg', 'lstm', 'mlp', 'resnet18', 'resnet34', 'rnn', 'vgg16', 'vgg19'], \ + 'Model not supported!' + model = eval('models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
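+    # The assertions above restrict --model, --dataset and --opt to the supported
+    # values; the optimizer selected via --opt is constructed below, and every
+    # variant uses the learning rate passed through --learning-rate.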
+ + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer( + learning_rate=args.learning_rate, nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = ht.optim.AdaGradOptimizer( + learning_rate=args.learning_rate, initial_accumulator_value=0.1) + else: + print_rank0('Use Adam Optimizer.') + opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate) + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + # train_set_x: (50000, 784), train_set_y: (50000, 10) + # valid_set_x: (10000, 784), valid_set_y: (10000, 10) + # x_shape = (args.batch_size, 784) + # y_shape = (args.batch_size, 10) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=10) + if args.model == "mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 10) + # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 10) + # x_shape = (args.batch_size, 3, 32, 32) + # y_shape = (args.batch_size, 10) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=100) + # train_set_x: (50000, 3, 32, 32), train_set_y: (50000, 100) + # valid_set_x: (10000, 3, 32, 32), valid_set_y: (10000, 100) + else: + raise NotImplementedError + + # model definition + print_rank0('Building model {}'.format(args.model)) + x = ht.dataloader_op([ + ht.Dataloader(train_set_x, args.batch_size, 'train'), + ht.Dataloader(valid_set_x, args.batch_size, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(train_set_y, args.batch_size, 'train'), + ht.Dataloader(valid_set_y, args.batch_size, 'validate'), + ]) + if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': + loss, y = model(x, y_, 100) + else: + loss, y = model(x, y_) + + train_op = opt.minimize(loss) + + eval_nodes = {'train': [loss, y, y_, train_op], 'validate': [loss, y, y_]} + executor = ht.Executor(eval_nodes, ctx=executor_ctx, + comm_mode=args.comm_mode) + n_train_batches = executor.get_batch_num('train') + n_valid_batches = executor.get_batch_num('validate') + + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + loss_val, predict_y, y_val, _ = executor.run( + 'train', eval_node_list=[loss, y, y_, train_op]) + # Loss for this minibatch + predict_y = predict_y.asnumpy() + y_val = y_val.asnumpy() + loss_all += loss_val.asnumpy() + batch_num += 1 + # Predict accuracy for this minibatch + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % 
loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + during_time = end - start + print_rank0("Running time of current epoch = %fs" % (during_time)) + if i != 0: + running_time += during_time + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + loss_val, valid_y_predicted, y_val = executor.run( + 'validate', eval_node_list=[loss, y, y_], convert_to_numpy_ret_vals=True) + val_loss_all += loss_val + batch_num += 1 + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) + if args.comm_mode in ('AllReduce', 'Hybrid'): + ht.mpi_nccl_finish(comm) diff --git a/examples/cnn/models/AlexNet.py b/examples/cnn/models/AlexNet.py new file mode 100644 index 0000000..1491dbf --- /dev/null +++ b/examples/cnn/models/AlexNet.py @@ -0,0 +1,61 @@ +import hetu as ht +from hetu import init + + +def conv_bn_relu_pool(x, in_channel, out_channel, name, with_relu=True, with_pool=False): + weight = init.random_normal( + shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') + bn_scale = init.random_normal( + shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_scale') + bn_bias = init.random_normal( + shape=(1, out_channel, 1, 1), stddev=0.1, name=name+'_bn_bias') + x = ht.conv2d_op(x, weight, stride=1, padding=1) + x = ht.batch_normalization_op(x, bn_scale, bn_bias) + if with_relu: + x = ht.relu_op(x) + if with_pool: + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, stride=2, padding=0) + return x + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def alexnet(x, y_): + ''' + AlexNet model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building AlexNet model...') + x = ht.array_reshape_op(x, [-1, 1, 28, 28]) + x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1', + with_relu=True, with_pool=True) + x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2', + with_relu=True, with_pool=True) + x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3', + with_relu=True, with_pool=False) + x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4', + with_relu=True, with_pool=False) + x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5', + with_relu=False, with_pool=True) + x = ht.array_reshape_op(x, (-1, 256*3*3)) + x = fc(x, (256*3*3, 1024), name='alexnet_fc1', with_relu=True) + x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True) + y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/CNN.py b/examples/cnn/models/CNN.py new file mode 100644 index 0000000..ed84059 --- /dev/null +++ b/examples/cnn/models/CNN.py @@ -0,0 +1,41 @@ +import hetu as ht +from hetu import init + + +def conv_relu_avg(x, shape): + weight = init.random_normal(shape=shape, stddev=0.1) + x = ht.conv2d_op(x, weight, padding=2, stride=1) + x = ht.relu_op(x) + x = ht.avg_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def fc(x, shape): + weight = init.random_normal(shape=shape, stddev=0.1) + bias = init.random_normal(shape=shape[-1:], stddev=0.1) + x = ht.array_reshape_op(x, (-1, shape[0])) + x = ht.matmul_op(x, weight) + y = x + ht.broadcastto_op(bias, x) + return y + + +def cnn_3_layers(x, y_): + ''' + 3-layer-CNN model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building 3-layer-CNN model...') + x = ht.array_reshape_op(x, [-1, 1, 28, 28]) + x = conv_relu_avg(x, (32, 1, 5, 5)) + x = conv_relu_avg(x, (64, 32, 5, 5)) + y = fc(x, (7 * 7 * 64, 10)) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LSTM.py b/examples/cnn/models/LSTM.py new file mode 100644 index 0000000..7469a7c --- /dev/null +++ b/examples/cnn/models/LSTM.py @@ -0,0 +1,90 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def lstm(x, y_): + ''' + LSTM model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + forget_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w") + forget_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u") + forget_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_forget_gate_b") + input_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w") + input_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u") + input_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_input_gate_b") + output_gate_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w") + output_gate_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u") + output_gate_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_output_gate_b") + tanh_w = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w") + tanh_u = init.random_normal( + shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u") + tanh_b = init.random_normal( + shape=(dimhidden,), stddev=0.1, name="lstm_tanh_b") + out_weights = init.random_normal( + shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight") + out_bias = init.random_normal( + shape=(dimoutput,), stddev=0.1, name="lstm_out_bias") + initial_state = ht.Variable(value=np.zeros((1,)).astype( + np.float32), name='initial_state', trainable=False) + + for i in range(nsteps): + cur_x = ht.slice_op(x, (0, i * diminput), (-1, diminput)) + # forget gate + if i == 0: + temp = ht.matmul_op(cur_x, forget_gate_w) + last_c_state = ht.broadcastto_op(initial_state, temp) + last_h_state = ht.broadcastto_op(initial_state, temp) + cur_forget = ht.matmul_op(last_h_state, forget_gate_u) + temp + else: + cur_forget = ht.matmul_op( + last_h_state, forget_gate_u) + ht.matmul_op(cur_x, forget_gate_w) + cur_forget = cur_forget + ht.broadcastto_op(forget_gate_b, cur_forget) + cur_forget = ht.sigmoid_op(cur_forget) + # input gate + cur_input = ht.matmul_op( + last_h_state, input_gate_u) + ht.matmul_op(cur_x, input_gate_w) + cur_input = cur_input + ht.broadcastto_op(input_gate_b, cur_input) + cur_input = ht.sigmoid_op(cur_input) + # output gate + cur_output = ht.matmul_op( + last_h_state, output_gate_u) + ht.matmul_op(cur_x, output_gate_w) + cur_output = cur_output + ht.broadcastto_op(output_gate_b, cur_output) + cur_output = ht.sigmoid_op(cur_output) + # tanh + cur_tanh = ht.matmul_op(last_h_state, tanh_u) + \ + ht.matmul_op(cur_x, tanh_w) + cur_tanh = cur_tanh + ht.broadcastto_op(tanh_b, cur_tanh) + cur_tanh = ht.tanh_op(cur_tanh) + + last_c_state = ht.mul_op(last_c_state, cur_forget) + \ + ht.mul_op(cur_input, cur_tanh) + last_h_state = ht.tanh_op(last_c_state) * cur_output + + x = ht.matmul_op(last_h_state, out_weights) + y = x + ht.broadcastto_op(out_bias, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LeNet.py b/examples/cnn/models/LeNet.py new file mode 100644 index 0000000..a14903c --- /dev/null +++ b/examples/cnn/models/LeNet.py @@ -0,0 +1,46 @@ 
+import hetu as ht +from hetu import init + + +def conv_pool(x, in_channel, out_channel, name): + weight = init.random_normal( + shape=(out_channel, in_channel, 5, 5), stddev=0.1, name=name+'_weight') + x = ht.conv2d_op(x, weight, padding=2, stride=1) + x = ht.relu_op(x) + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def lenet(x, y_): + ''' + LeNet model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print('Building LeNet model...') + x = ht.array_reshape_op(x, (-1, 1, 28, 28)) + x = conv_pool(x, 1, 6, name='lenet_conv1') + x = conv_pool(x, 6, 16, name='lenet_conv2') + x = ht.array_reshape_op(x, (-1, 7*7*16)) + x = fc(x, (7*7*16, 120), name='lenet_fc1', with_relu=True) + x = fc(x, (120, 84), name='lenet_fc2', with_relu=True) + y = fc(x, (84, 10), name='lenet_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/LogReg.py b/examples/cnn/models/LogReg.py new file mode 100644 index 0000000..a8a05a6 --- /dev/null +++ b/examples/cnn/models/LogReg.py @@ -0,0 +1,24 @@ +import hetu as ht +from hetu import init + + +def logreg(x, y_): + ''' + Logistic Regression model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Build logistic regression model...") + weight = init.zeros((784, 10), name='logreg_weight') + bias = init.zeros((10,), name='logreg_bias') + x = ht.matmul_op(x, weight) + y = x + ht.broadcastto_op(bias, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/MLP.py b/examples/cnn/models/MLP.py new file mode 100644 index 0000000..184e200 --- /dev/null +++ b/examples/cnn/models/MLP.py @@ -0,0 +1,33 @@ +import hetu as ht +from hetu import init + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def mlp(x, y_): + ''' + MLP model, for MNIST dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building MLP model...") + x = fc(x, (3072, 256), 'mlp_fc1', with_relu=True) + x = fc(x, (256, 256), 'mlp_fc2', with_relu=True) + y = fc(x, (256, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/RNN.py b/examples/cnn/models/RNN.py new file mode 100644 index 0000000..a767952 --- /dev/null +++ b/examples/cnn/models/RNN.py @@ -0,0 +1,56 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def rnn(x, y_): + ''' + RNN model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building RNN model...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + weight1 = init.random_normal( + shape=(diminput, dimhidden), stddev=0.1, name='rnn_weight1') + bias1 = init.random_normal( + shape=(dimhidden, ), stddev=0.1, name='rnn_bias1') + weight2 = init.random_normal( + shape=(dimhidden+dimhidden, dimhidden), stddev=0.1, name='rnn_weight2') + bias2 = init.random_normal( + shape=(dimhidden, ), stddev=0.1, name='rnn_bias2') + weight3 = init.random_normal( + shape=(dimhidden, dimoutput), stddev=0.1, name='rnn_weight3') + bias3 = init.random_normal( + shape=(dimoutput, ), stddev=0.1, name='rnn_bias3') + last_state = ht.Variable(value=np.zeros((1,)).astype( + np.float32), name='initial_state', trainable=False) + + for i in range(nsteps): + cur_x = ht.slice_op(x, (0, i*diminput), (-1, diminput)) + h = ht.matmul_op(cur_x, weight1) + h = h + ht.broadcastto_op(bias1, h) + + if i == 0: + last_state = ht.broadcastto_op(last_state, h) + s = ht.concat_op(h, last_state, axis=1) + s = ht.matmul_op(s, weight2) + s = s + ht.broadcastto_op(bias2, s) + last_state = ht.relu_op(s) + + final_state = last_state + x = ht.matmul_op(final_state, weight3) + y = x + ht.broadcastto_op(bias3, x) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/cnn/models/ResNet.py b/examples/cnn/models/ResNet.py new file mode 100644 index 0000000..cd1d30e --- /dev/null +++ b/examples/cnn/models/ResNet.py @@ -0,0 +1,125 @@ +import hetu as ht +from hetu import init + + +def conv2d(x, in_channel, out_channel, stride=1, padding=1, name=''): + weight = init.random_normal( + shape=(out_channel, in_channel, 3, 3), stddev=0.1, name=name+'_weight') + x = ht.conv2d_op(x, weight, stride=stride, padding=padding) + return x + + +def batch_norm_with_relu(x, hidden, name): + scale = init.random_normal( + shape=(1, hidden, 1, 1), stddev=0.1, name=name+'_scale') + bias = init.random_normal(shape=(1, hidden, 1, 1), + stddev=0.1, name=name+'_bias') + x = ht.batch_normalization_op(x, scale, bias) + x = ht.relu_op(x) + return x + + +def resnet_block(x, in_channel, num_blocks, is_first=False, name=''): + if is_first: + out_channel = in_channel + identity = x + x = conv2d(x, in_channel, out_channel, stride=1, + padding=1, name=name+'_conv1') + x = batch_norm_with_relu(x, out_channel, name+'_bn1') + x = conv2d(x, out_channel, out_channel, stride=1, + 
padding=1, name=name+'_conv2') + x = x + identity + else: + out_channel = 2 * in_channel + identity = x + x = batch_norm_with_relu(x, in_channel, name+'_bn0') + x = ht.pad_op(x, [[0, 0], [0, 0], [0, 1], [0, 1]]) + x = conv2d(x, in_channel, out_channel, stride=2, + padding=0, name=name+'_conv1') + x = batch_norm_with_relu(x, out_channel, name+'_bn1') + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv2') + identity = ht.avg_pool2d_op( + identity, kernel_H=2, kernel_W=2, padding=0, stride=2) + identity = ht.pad_op( + identity, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]]) + x = x + identity + + for i in range(1, num_blocks): + identity = x + x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i)) + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv%d' % (2 * i + 1)) + x = batch_norm_with_relu(x, out_channel, name+'_bn%d' % (2 * i + 1)) + x = conv2d(x, out_channel, out_channel, stride=1, + padding=1, name=name+'_conv%d' % (2 * i + 2)) + x = x + identity + + return x + + +def fc(x, shape, name): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + return x + + +def resnet(x, y_, num_layers=18, num_class=10): + ''' + ResNet model, for CIFAR10 dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + base_size = 16 + + x = conv2d(x, 3, base_size, stride=1, padding=1, + name='resnet_initial_conv') + x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn') + + if num_layers == 18: + print("Building ResNet-18 model...") + x = resnet_block(x, base_size, num_blocks=2, + is_first=True, name='resnet_block1') + x = resnet_block(x, base_size, num_blocks=2, + is_first=False, name='resnet_block2') + x = resnet_block(x, 2 * base_size, num_blocks=2, + is_first=False, name='resnet_block3') + x = resnet_block(x, 4 * base_size, num_blocks=2, + is_first=False, name='resnet_block4') + elif num_layers == 34: + print("Building ResNet-34 model...") + x = resnet_block(x, base_size, num_blocks=3, + is_first=True, name='resnet_block1') + x = resnet_block(x, base_size, num_blocks=4, + is_first=False, name='resnet_block2') + x = resnet_block(x, 2 * base_size, num_blocks=6, + is_first=False, name='resnet_block3') + x = resnet_block(x, 4 * base_size, num_blocks=3, + is_first=False, name='resnet_block4') + else: + assert False, "Number of layers should be 18 or 34 !" 
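+    # Shape note (assuming the 32x32 CIFAR inputs this model is written for): blocks 2-4 each halve the spatial size and double the channel count, so the feature map here is (8 * base_size) channels on a 4x4 grid, i.e. 128 * base_size features once flattened for the final FC layer below.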
+ + x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn') + x = ht.array_reshape_op(x, (-1, 128 * base_size)) + y = fc(x, (128 * base_size, num_class), name='resnet_final_fc') + # here we don't use cudnn for softmax crossentropy to avoid overflows + loss = ht.softmaxcrossentropy_op(y, y_, use_cudnn=False) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y + + +def resnet18(x, y_, num_class=10): + return resnet(x, y_, 18, num_class) + + +def resnet34(x, y_, num_class=10): + return resnet(x, y_, 34, num_class) diff --git a/examples/cnn/models/VGG.py b/examples/cnn/models/VGG.py new file mode 100644 index 0000000..9cdab5f --- /dev/null +++ b/examples/cnn/models/VGG.py @@ -0,0 +1,100 @@ +import hetu as ht +from hetu import init + + +def conv_bn_relu(x, in_channel, out_channel, name): + weight = init.random_normal(shape=(out_channel, in_channel, 3, 3), + stddev=0.1, name=name+'_weight') + bn_scale = init.random_normal(shape=(1, out_channel, 1, 1), + stddev=0.1, name=name+'_bn_scale') + bn_bias = init.random_normal(shape=(1, out_channel, 1, 1), + stddev=0.1, name=name+'_bn_bias') + + x = ht.conv2d_op(x, weight, padding=1, stride=1) + x = ht.batch_normalization_op(x, bn_scale, bn_bias) + act = ht.relu_op(x) + return act + + +def vgg_2block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_3block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_4block(x, in_channel, out_channel, name): + x = conv_bn_relu(x, in_channel, out_channel, name=name+'_layer1') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer2') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer3') + x = conv_bn_relu(x, out_channel, out_channel, name=name+'_layer4') + x = ht.max_pool2d_op(x, kernel_H=2, kernel_W=2, padding=0, stride=2) + return x + + +def vgg_fc(x, in_feat, out_feat, name): + weight = init.random_normal(shape=(in_feat, out_feat), + stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=(out_feat,), + stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + return x + + +def vgg(x, y_, num_layers, num_class=10): + ''' + VGG model, for CIFAR10/CIFAR100 dataset. 
+ + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + num_layers: 16 or 19 + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + if num_layers == 16: + print('Building VGG-16 model...') + x = vgg_2block(x, 3, 64, 'vgg_block1') + x = vgg_2block(x, 64, 128, 'vgg_block2') + x = vgg_3block(x, 128, 256, 'vgg_block3') + x = vgg_3block(x, 256, 512, 'vgg_block4') + x = vgg_3block(x, 512, 512, 'vgg_block5') + + elif num_layers == 19: + print('Building VGG-19 model...') + x = vgg_2block(x, 3, 64, 'vgg_block1') + x = vgg_2block(x, 64, 128, 'vgg_block2') + x = vgg_4block(x, 128, 256, 'vgg_block3') + x = vgg_4block(x, 256, 512, 'vgg_block4') + x = vgg_4block(x, 512, 512, 'vgg_block5') + + else: + assert False, 'VGG model should have 16 or 19 layers!' + + x = ht.array_reshape_op(x, (-1, 512)) + x = vgg_fc(x, 512, 4096, 'vgg_fc1') + x = vgg_fc(x, 4096, 4096, 'vgg_fc2') + y = vgg_fc(x, 4096, num_class, 'vgg_fc3') + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + + return loss, y + + +def vgg16(x, y_, num_class=10): + return vgg(x, y_, 16, num_class) + + +def vgg19(x, y_, num_class=10): + return vgg(x, y_, 19, num_class) diff --git a/examples/cnn/models/__init__.py b/examples/cnn/models/__init__.py new file mode 100644 index 0000000..f6306d2 --- /dev/null +++ b/examples/cnn/models/__init__.py @@ -0,0 +1,9 @@ +from .VGG import vgg, vgg16, vgg19 +from .LogReg import logreg +from .CNN import cnn_3_layers +from .AlexNet import alexnet +from .LeNet import lenet +from .MLP import mlp +from .RNN import rnn +from .LSTM import lstm +from .ResNet import resnet, resnet18, resnet34 diff --git a/examples/cnn/pytorch_models/__init__.py b/examples/cnn/pytorch_models/__init__.py new file mode 100644 index 0000000..022e62e --- /dev/null +++ b/examples/cnn/pytorch_models/__init__.py @@ -0,0 +1,4 @@ +from .mlp import mlp +from .resnet import resnet18, resnet34, resnet50 +from .vgg import vgg16, vgg19 +from .rnn import rnn diff --git a/examples/cnn/pytorch_models/mlp.py b/examples/cnn/pytorch_models/mlp.py new file mode 100644 index 0000000..398799f --- /dev/null +++ b/examples/cnn/pytorch_models/mlp.py @@ -0,0 +1,20 @@ +import torch.nn.functional as F +import torch.nn as nn + + +class MLP(nn.Module): + def __init__(self): + super(MLP, self).__init__() + self.fc1 = nn.Linear(3072, 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 10) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + out = self.fc3(x) + return out + + +def mlp(): + return MLP() diff --git a/examples/cnn/pytorch_models/resnet.py b/examples/cnn/pytorch_models/resnet.py new file mode 100644 index 0000000..927adec --- /dev/null +++ b/examples/cnn/pytorch_models/resnet.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + 
kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion * + planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def resnet18(num_classes=10): + return ResNet(BasicBlock, [2, 2, 2, 2], num_classes) + + +def resnet34(num_classes=10): + return ResNet(BasicBlock, [3, 4, 6, 3], num_classes) + + +def resnet50(num_classes=10): + return ResNet(Bottleneck, [3, 4, 6, 3], num_classes) + + +def resnet101(num_classes=10): + return ResNet(Bottleneck, [3, 4, 23, 3], num_classes) + + +def resnet152(num_classes=10): + return ResNet(Bottleneck, [3, 8, 36, 3], num_classes) diff --git a/examples/cnn/pytorch_models/rnn.py b/examples/cnn/pytorch_models/rnn.py new file mode 100644 index 0000000..0a298d7 --- /dev/null +++ b/examples/cnn/pytorch_models/rnn.py @@ -0,0 +1,36 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class RNN(nn.Module): + def __init__(self, diminput, dimoutput, dimhidden, nsteps): + super(RNN, self).__init__() + self.diminput = diminput + self.dimoutput = dimoutput + self.dimhidden = dimhidden + self.nsteps = nsteps + self.fc1 = nn.Linear(diminput, dimhidden) + self.fc2 = nn.Linear(dimhidden*2, dimhidden) + self.fc3 = nn.Linear(dimhidden, dimoutput) + + def forward(self, x): + last_state = torch.zeros((x.shape[0], self.dimhidden)).to(x.device) + for i 
in range(self.nsteps): + t = i % self.nsteps + index = torch.Tensor([idx for idx in range( + t*self.diminput, (t+1)*self.diminput)]).long().to(x.device) + cur_x = torch.index_select(x, 1, index) + h = self.fc1(cur_x) + s = torch.cat([h, last_state], axis=1) + s = self.fc2(s) + last_state = F.relu(s) + + final_state = last_state + y = self.fc3(final_state) + return y + + +def rnn(diminput, dimoutput, dimhidden, nsteps): + + return RNN(diminput, dimoutput, dimhidden, nsteps) diff --git a/examples/cnn/pytorch_models/vgg.py b/examples/cnn/pytorch_models/vgg.py new file mode 100644 index 0000000..0fa2e88 --- /dev/null +++ b/examples/cnn/pytorch_models/vgg.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +cfg = { + 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], + 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], +} + + +class VGG(nn.Module): + def __init__(self, vgg_name, num_class=10): + super(VGG, self).__init__() + self.features = self._make_layers(cfg[vgg_name]) + self.fc1 = nn.Linear(512, 4096) + self.fc2 = nn.Linear(4096, 4096) + self.classifier = nn.Linear(4096, num_class) + + def forward(self, x): + out = self.features(x) + out = out.view(out.size(0), -1) + out = self.fc2(self.fc1(out)) + out = self.classifier(out) + return out + + def _make_layers(self, cfg): + layers = [] + in_channels = 3 + for x in cfg: + if x == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), + nn.BatchNorm2d(x), + nn.ReLU(inplace=True)] + in_channels = x + layers += [nn.AvgPool2d(kernel_size=1, stride=1)] + return nn.Sequential(*layers) + + +def vgg16(num_class=10): + return VGG('VGG16', num_class) + + +def vgg19(num_class=10): + return VGG('VGG19', num_class) diff --git a/examples/cnn/run_tf_horovod.py b/examples/cnn/run_tf_horovod.py new file mode 100644 index 0000000..fa2b96f --- /dev/null +++ b/examples/cnn/run_tf_horovod.py @@ -0,0 +1,309 @@ +import os +import numpy as np +import tensorflow as tf +import tf_models +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +import horovod.tensorflow as hvd +import hetu as ht +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if rank % 8 == 0: + logger.info(msg) + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model +# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model +# horovodrun -np 16 --start-timeout 3000 -H daim116:8,daim117:8 +# python /home/public/nxn/Athena-master/examples/cnn/run_tf_horovod.py --model tf_rnn + + +# if using multi nodes setting in conda, need to modify /etc/bash.bashrc +# we can also use mpirun (default gloo): +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model +''' +def train(model, args): + hvd.init() + + def get_current_shard(data): + part_size = data.shape[0] // hvd.size() 
+ start = part_size * hvd.rank() + end = start + part_size if hvd.rank() != hvd.size() - 1 else data.shape[0] + return data[start:end] + + batch_size = 128 + if args.model == 'tf_resnet34': + train_images, train_labels, test_images,\ + test_labels = ht.data.tf_normalize_cifar10() + x = tf.compat.v1.placeholder(tf.float32, [batch_size, 32, 32, 3]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) + else: + datasets = ht.data.mnist() + train_images, train_labels = datasets[0] + test_images, test_labels = datasets[2] + x = tf.compat.v1.placeholder(tf.float32, [batch_size, 784]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 10]) + + + n_train_batches = train_images.shape[0] // batch_size + + loss, y = model(x, y_) + opt = tf.train.GradientDescentOptimizer(learning_rate=0.01) + + global_step = tf.train.get_or_create_global_step() + # here in DistributedOptimizer by default all tensor are reduced on GPU + # can use device_sparse=xxx, device_dense=xxx to modify + # if using device_sparse='/cpu:0', the performance degrades + train_op = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank())) + # here horovod default use gpu to initialize, which will cause OOM + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession(hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + iterations = train_images.shape[0] // batch_size + total_epoch = 10 + start_index = 0 + total_time = 0 + for ep in range(total_epoch + 1): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc = [], [] + for it in range(n_train_batches): + x_val = train_images[start_index: start_index + batch_size] + y_val = train_labels[start_index : start_index+batch_size] + start_index += batch_size + if start_index + batch_size > train_images.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], feed_dict={x:x_val, y_:y_val}) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + en_time = time.time() + train_time = en_time - st_time + if ep != 0: + total_time += train_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, train_time) + + print("training time:", total_time) + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, help="model to be tested") + parser.add_argument("--all", action="store_true", help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + print('Model:', raw_model) + train(model, args) + +if __name__ == '__main__': + main() +''' + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + hvd.init() + global rank + rank = hvd.rank() + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + if dataset == 'MNIST': + x = 
tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 32, 32, 3), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 32, 32, 3), name='x') + y_ = tf.compat.v1.placeholder( + dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + + global_step = tf.train.get_or_create_global_step() + # In DistributedOptimizer, all tensors are reduced on GPU by default; + # pass device_sparse=... / device_dense=... to change the placement + # (using device_sparse='/cpu:0' degrades performance) + train_op = hvd.DistributedOptimizer( + opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions( + allow_growth=True, visible_device_list=str(hvd.local_rank())) + # Horovod broadcasts the initial variables on GPU by default, which can cause OOM, so broadcast on CPU instead + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession( + hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + # sess.run(tf.compat.v1.global_variables_initializer()) + + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time.time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time.time() + print_rank0("Running time of current epoch = %fs" % (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_val + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + 
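# Note: the epoch loop above runs num_epochs + 1 times and epoch 0 is excluded from running_time, so the total reported below covers num_epochs timed epochs. +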
print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/scripts/hetu_16gpu.sh b/examples/cnn/scripts/hetu_16gpu.sh new file mode 100644 index 0000000..4b4c130 --- /dev/null +++ b/examples/cnn/scripts/hetu_16gpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py +depsdir=${workdir}/../../.. +echo $depsdir +### validate and timing +$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce + diff --git a/examples/cnn/scripts/hetu_1gpu.sh b/examples/cnn/scripts/hetu_1gpu.sh new file mode 100644 index 0000000..e63b3eb --- /dev/null +++ b/examples/cnn/scripts/hetu_1gpu.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + + +# model: +# e.g. bash hetu_1gpu.sh mlp CIFAR10 + +### validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing diff --git a/examples/cnn/scripts/hetu_2gpu_ps.sh b/examples/cnn/scripts/hetu_2gpu_ps.sh new file mode 100644 index 0000000..92f5eae --- /dev/null +++ b/examples/cnn/scripts/hetu_2gpu_ps.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py + +### validate and timing +python -m hetu.launcher ${workdir}/../local_s1.yml -n 1 --sched & +python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 0 & +python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS --gpu 1 & +wait \ No newline at end of file diff --git a/examples/cnn/scripts/hetu_8gpu.sh b/examples/cnn/scripts/hetu_8gpu.sh new file mode 100644 index 0000000..f2a99e3 --- /dev/null +++ b/examples/cnn/scripts/hetu_8gpu.sh @@ -0,0 +1,8 @@ +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../main.py +depsdir=${workdir}/../../.. 
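+# NOTE: the mpirun command below hard-codes a site-specific PYTHONPATH and Python interpreter path; adjust them (and the host/interface names in the other multi-node scripts) to match your environment.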
+ +### validate and timing +# +NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce diff --git a/examples/cnn/scripts/horovod_16gpu.sh b/examples/cnn/scripts/horovod_16gpu.sh new file mode 100644 index 0000000..7db8800 --- /dev/null +++ b/examples/cnn/scripts/horovod_16gpu.sh @@ -0,0 +1,11 @@ + +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_tf_horovod.py + +# horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing + +horovodrun -np 16 --start-timeout 3000 -H daim118:8,daim117:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing + +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model diff --git a/examples/cnn/scripts/horovod_8gpu.sh b/examples/cnn/scripts/horovod_8gpu.sh new file mode 100644 index 0000000..10e3cc7 --- /dev/null +++ b/examples/cnn/scripts/horovod_8gpu.sh @@ -0,0 +1,6 @@ + +#!/bin/bash +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_tf_horovod.py + +horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing diff --git a/examples/cnn/scripts/pytorch_16gpu_0.sh b/examples/cnn/scripts/pytorch_16gpu_0.sh new file mode 100644 index 0000000..2a847f9 --- /dev/null +++ b/examples/cnn/scripts/pytorch_16gpu_0.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=162.105.146.117 +MASTER_PORT=6000 +NNODES=2 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/pytorch_16gpu_1.sh b/examples/cnn/scripts/pytorch_16gpu_1.sh new file mode 100644 index 0000000..dba0ecf --- /dev/null +++ b/examples/cnn/scripts/pytorch_16gpu_1.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=162.105.146.117 +MASTER_PORT=39575 +NNODES=2 +NODE_RANK=1 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/pytorch_1gpu.sh b/examples/cnn/scripts/pytorch_1gpu.sh new file mode 100644 index 0000000..cc40166 --- /dev/null +++ b/examples/cnn/scripts/pytorch_1gpu.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +## validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing diff --git a/examples/cnn/scripts/pytorch_8gpu.sh 
b/examples/cnn/scripts/pytorch_8gpu.sh new file mode 100644 index 0000000..ebbb049 --- /dev/null +++ b/examples/cnn/scripts/pytorch_8gpu.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../torch_main.py + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + ${mainpy} \ + --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed \ No newline at end of file diff --git a/examples/cnn/scripts/tf_16gpu_worker0.sh b/examples/cnn/scripts/tf_16gpu_worker0.sh new file mode 100644 index 0000000..0f11c59 --- /dev/null +++ b/examples/cnn/scripts/tf_16gpu_worker0.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 0 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 1 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 2 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 3 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 4 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 5 --gpu 5 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 6 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 7 --gpu 7 --timing --validate & +wait + diff --git a/examples/cnn/scripts/tf_16gpu_worker1.sh b/examples/cnn/scripts/tf_16gpu_worker1.sh new file mode 100644 index 0000000..dcdde09 --- /dev/null +++ b/examples/cnn/scripts/tf_16gpu_worker1.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 8 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 9 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 10 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 11 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 12 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 13 --gpu 5 --timing --validate & +python ${mainpy} --model $1 
--dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 14 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --config ${workdir}/../settings/tf_dist_s1_w16.json --rank 15 --gpu 7 --timing --validate & +wait \ No newline at end of file diff --git a/examples/cnn/scripts/tf_1gpu.sh b/examples/cnn/scripts/tf_1gpu.sh new file mode 100644 index 0000000..f58f323 --- /dev/null +++ b/examples/cnn/scripts/tf_1gpu.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_main.py + +### validate and timing +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing + +### run in cpu +# python ${mainpy} --model tf_mlp --gpu -1 --validate --timing diff --git a/examples/cnn/scripts/tf_8gpu.sh b/examples/cnn/scripts/tf_8gpu.sh new file mode 100644 index 0000000..70ecb79 --- /dev/null +++ b/examples/cnn/scripts/tf_8gpu.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 0 --gpu 0 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 1 --gpu 1 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 2 --gpu 2 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 3 --gpu 3 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 4 --gpu 4 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 5 --gpu 5 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 6 --gpu 6 --timing --validate & +python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --config ${workdir}/../settings/tf_dist_s1_w8.json --rank 7 --gpu 7 --timing --validate & +wait + diff --git a/examples/cnn/settings/tf_dist_s1_w16.json b/examples/cnn/settings/tf_dist_s1_w16.json new file mode 100644 index 0000000..2b3c1d0 --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w16.json @@ -0,0 +1,23 @@ +{ + "worker": [ + "162.105.146.117:34569", + "162.105.146.117:34568", + "162.105.146.117:34567", + "162.105.146.117:34566", + "162.105.146.117:34565", + "162.105.146.117:34564", + "162.105.146.117:34563", + "162.105.146.117:34562", + "162.105.146.118:34779", + "162.105.146.118:34778", + "162.105.146.118:34777", + "162.105.146.118:34776", + "162.105.146.118:34775", + "162.105.146.118:34774", + "162.105.146.118:34773", + "162.105.146.118:34772" + ], + "ps": [ + "162.105.146.117:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/settings/tf_dist_s1_w4.json b/examples/cnn/settings/tf_dist_s1_w4.json new file mode 100644 index 0000000..c9f08f6 --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w4.json @@ -0,0 +1,11 @@ +{ + "worker": [ + "162.105.146.119:34569", + "162.105.146.119:34568", + "162.105.146.119:34567", + "162.105.146.119:34566" + ], + "ps": [ + "162.105.146.119:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/settings/tf_dist_s1_w8.json 
b/examples/cnn/settings/tf_dist_s1_w8.json new file mode 100644 index 0000000..92ce51c --- /dev/null +++ b/examples/cnn/settings/tf_dist_s1_w8.json @@ -0,0 +1,15 @@ +{ + "worker": [ + "162.105.146.119:34569", + "162.105.146.119:34568", + "162.105.146.119:34567", + "162.105.146.119:34566", + "162.105.146.119:34565", + "162.105.146.119:34564", + "162.105.146.119:34563", + "162.105.146.119:34562" + ], + "ps": [ + "162.105.146.119:34575" + ] +} \ No newline at end of file diff --git a/examples/cnn/tf_launch_server.py b/examples/cnn/tf_launch_server.py new file mode 100644 index 0000000..67ca216 --- /dev/null +++ b/examples/cnn/tf_launch_server.py @@ -0,0 +1,49 @@ +import os +import tensorflow as tf +import multiprocessing +import signal +import json +import argparse + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + os.environ['CUDA_VISIBLE_DEVICES'] = '' + + +pop_env() + + +def start_server(cluster, task_id): + server = tf.train.Server(cluster, job_name='ps', task_index=task_id) + server.join() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s1_w8.json', help="config file path") + parser.add_argument("--id", type=int, required=True) + args = parser.parse_args() + raw_config = args.config + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + global proc + proc = multiprocessing.Process( + target=start_server, args=[cluster, args.id, ]) + proc.start() + signal.signal(signal.SIGINT, signal_handler) + proc.join() + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + global proc + proc.kill() + exit(0) + + +if __name__ == '__main__': + main() diff --git a/examples/cnn/tf_launch_worker.py b/examples/cnn/tf_launch_worker.py new file mode 100644 index 0000000..925bc83 --- /dev/null +++ b/examples/cnn/tf_launch_worker.py @@ -0,0 +1,234 @@ +import tensorflow as tf +import tf_models +import hetu as ht + +import numpy as np +import argparse +import json +from time import time +import os +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if task_id % 8 == 0: + logger.info(msg) + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, 
default='./settings/tf_dist_s1_w2.json', help="config file path") + + args = parser.parse_args() + global task_id + + task_id = int(args.rank) + print_rank0("task id %d" % (task_id)) + raw_config = args.config + + if args.gpu == -1: + device = '/job:worker/task:%d/cpu:0' % (task_id) + print_rank0('Use CPU.') + else: + device = "/job:worker/task:%d/gpu:%d" % (task_id, args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + with tf.device( + tf.compat.v1.train.replica_device_setter( + worker_device=device, + cluster=cluster)): + # data loading + print_rank0('Loading %s data...' 
% dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + if dataset == 'MNIST': + x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + # training + print_rank0("Start training loop...") + running_time = 0 + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + 
print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % + (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_all + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/tf_main.py b/examples/cnn/tf_main.py new file mode 100644 index 0000000..feb1400 --- /dev/null +++ b/examples/cnn/tf_main.py @@ -0,0 +1,194 @@ +import tensorflow as tf +import tf_models +import hetu as ht +import numpy as np +import argparse +from time import time +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + logger.info(msg) + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + if args.gpu == -1: + device = '/cpu:0' + print_rank0('Use CPU.') + else: + device = '/gpu:%d' % args.gpu + print_rank0('Use GPU %d.' % args.gpu) + + print_rank0("Training {} on TensorFlow".format(args.model)) + assert args.model in ['tf_cnn_3_layers', 'tf_lenet', 'tf_logreg', 'tf_lstm', 'tf_mlp', 'tf_resnet18', 'tf_resnet34', 'tf_rnn', 'tf_vgg16', 'tf_vgg19'], \ + 'Model not supported now.' + model = eval('tf_models.' + args.model) + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
+ if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = tf.train.GradientDescentOptimizer( + learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = tf.train.MomentumOptimizer( + learning_rate=args.learning_rate, momentum=0.9, use_nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = tf.train.AdagradOptimizer(learning_rate=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + # model definition + print_rank0('Building model...') + with tf.device(device): + if dataset == 'MNIST': + x = tf.placeholder(dtype=tf.float32, shape=(None, 784), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_) + elif dataset == 'CIFAR10': + if args.model == "tf_mlp": + x = tf.placeholder( + dtype=tf.float32, shape=(None, 3072), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + else: + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder( + dtype=tf.float32, shape=(None, 10), name='y_') + loss, y = model(x, y_, 10) + elif dataset == 'CIFAR100': + x = tf.placeholder(dtype=tf.float32, shape=( + None, 32, 32, 3), name='x') + y_ = tf.placeholder(dtype=tf.float32, shape=(None, 100), name='y_') + loss, y = model(x, y_, 100) + + train_op = opt.minimize(loss) + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=10) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + if args.model == "tf_mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.tf_normalize_cifar( + num_class=100) + n_train_batches = train_set_x.shape[0] // args.batch_size + n_valid_batches = valid_set_x.shape[0] // args.batch_size + # train_set_x: (50000, 32, 32, 3), train_set_y: (50000,) + # valid_set_x: (10000, 32, 32, 3), valid_set_y: (10000,) + else: + raise NotImplementedError + + # training + print_rank0("Start training loop...") + running_time = 0 + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for i in range(args.num_epochs + 1): + print_rank0("Epoch %d" % i) + loss_all = 0 + batch_num = 0 + if args.timing: + start = time() + correct_predictions = [] + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + x_val = train_set_x[minibatch_start:minibatch_end] + y_val = 
train_set_y[minibatch_start:minibatch_end] + loss_val, predict_y, _ = sess.run([loss, y, train_op], + feed_dict={x: x_val, y_: y_val}) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + batch_num += 1 + loss_all += loss_val + loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Train loss = %f" % loss_all) + print_rank0("Train accuracy = %f" % accuracy) + + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % + (end - start)) + if i != 0: + running_time += (end - start) + + if args.validate: + val_loss_all = 0 + batch_num = 0 + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + valid_x_val = valid_set_x[minibatch_start:minibatch_end] + valid_y_val = valid_set_y[minibatch_start:minibatch_end] + loss_val, valid_y_predicted = sess.run([loss, y], + feed_dict={x: valid_x_val, y_: valid_y_val}) + correct_prediction = np.equal( + np.argmax(valid_y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + val_loss_all += loss_all + batch_num += 1 + val_loss_all /= batch_num + accuracy = np.mean(correct_predictions) + print_rank0("Validation loss = %f" % val_loss_all) + print_rank0("Validation accuracy = %f" % accuracy) + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/tf_models/__init__.py b/examples/cnn/tf_models/__init__.py new file mode 100644 index 0000000..6d713ff --- /dev/null +++ b/examples/cnn/tf_models/__init__.py @@ -0,0 +1,8 @@ +from .tf_LogReg import tf_logreg +from .tf_CNN import tf_cnn_3_layers +from .tf_LeNet import tf_lenet +from .tf_MLP import tf_mlp +from .tf_RNN import tf_rnn +from .tf_LSTM import tf_lstm +from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34 +from .tf_VGG import tf_vgg16, tf_vgg19 diff --git a/examples/cnn/tf_models/tf_CNN.py b/examples/cnn/tf_models/tf_CNN.py new file mode 100644 index 0000000..d8036a7 --- /dev/null +++ b/examples/cnn/tf_models/tf_CNN.py @@ -0,0 +1,45 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv_relu_avg(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) + x = tf.nn.relu(x) + x = tf.nn.avg_pool(x, ksize=[1, 2, 2, 1], + padding='VALID', strides=[1, 2, 2, 1]) + return x + + +def tf_fc(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.reshape(x, (-1, shape[0])) + y = tf.matmul(x, weight) + bias + return y + + +def tf_cnn_3_layers(x, y_): + ''' + 3-layer-CNN model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print('Building 3-layer-CNN model in tensorflow...') + x = tf.reshape(x, [-1, 28, 28, 1]) + x = tf_conv_relu_avg(x, (32, 1, 5, 5)) + x = tf_conv_relu_avg(x, (64, 32, 5, 5)) + x = tf.transpose(x, [0, 3, 1, 2]) + y = tf_fc(x, (7 * 7 * 64, 10)) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LSTM.py b/examples/cnn/tf_models/tf_LSTM.py new file mode 100644 index 0000000..32ebe23 --- /dev/null +++ b/examples/cnn/tf_models/tf_LSTM.py @@ -0,0 +1,81 @@ +import numpy as np +import tensorflow as tf + + +def tf_lstm(x, y_): + ''' + LSTM model in TensorFlow, for MNIST dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building LSTM model in tensorflow...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + forget_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + forget_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + forget_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + input_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + input_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + input_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + output_gate_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + output_gate_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + output_gate_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + tanh_w = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + tanh_u = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimhidden)).astype(np.float32)) + tanh_b = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden,)).astype(np.float32)) + out_weights = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) + out_bias = tf.Variable(np.random.normal( + scale=0.1, size=(dimoutput,)).astype(np.float32)) + initial_state = tf.zeros((tf.shape(x)[0], dimhidden), dtype=tf.float32) + + last_c_state = initial_state + last_h_state = initial_state + + for i in range(nsteps): + cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) + # forget gate + cur_forget = tf.matmul(last_h_state, forget_gate_u) + \ + tf.matmul(cur_x, forget_gate_w) + forget_gate_b + cur_forget = tf.sigmoid(cur_forget) + # input gate + cur_input = tf.matmul(last_h_state, input_gate_u) + \ + tf.matmul(cur_x, input_gate_w) + input_gate_b + cur_input = tf.sigmoid(cur_input) + # output gate + cur_output = tf.matmul(last_h_state, output_gate_u) + \ + tf.matmul(cur_x, 
output_gate_w) + output_gate_b + cur_output = tf.sigmoid(cur_output) + # tanh + cur_tanh = tf.matmul(last_h_state, tanh_u) + \ + tf.matmul(cur_x, tanh_w) + tanh_b + cur_tanh = tf.tanh(cur_tanh) + + last_c_state = last_c_state * cur_forget + cur_input * cur_tanh + last_h_state = tf.tanh(last_c_state) * cur_output + + y = tf.matmul(last_h_state, out_weights) + out_bias + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LeNet.py b/examples/cnn/tf_models/tf_LeNet.py new file mode 100644 index 0000000..03905f5 --- /dev/null +++ b/examples/cnn/tf_models/tf_LeNet.py @@ -0,0 +1,49 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv_pool(x, in_channel, out_channel): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 5, 5)).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, padding='SAME', strides=[1, 1, 1, 1]) + x = tf.nn.relu(x) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], + padding='VALID', strides=[1, 2, 2, 1]) + return x + + +def tf_fc(x, shape, with_relu=True): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + if with_relu: + x = tf.nn.relu(x) + return x + + +def tf_lenet(x, y_): + ''' + LeNet model in TensorFlow, for MNIST dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print('Building LeNet model in tensorflow...') + x = tf.reshape(x, [-1, 28, 28, 1]) + x = tf_conv_pool(x, 1, 6) + x = tf_conv_pool(x, 6, 16) + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, (-1, 7*7*16)) + x = tf_fc(x, (7*7*16, 120), with_relu=True) + x = tf_fc(x, (120, 84), with_relu=True) + y = tf_fc(x, (84, 10), with_relu=False) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_LogReg.py b/examples/cnn/tf_models/tf_LogReg.py new file mode 100644 index 0000000..67a7577 --- /dev/null +++ b/examples/cnn/tf_models/tf_LogReg.py @@ -0,0 +1,23 @@ +import numpy as np +import tensorflow as tf + + +def tf_logreg(x, y_): + ''' + Logistic Regression model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Build logistic regression model in tensorflow...") + weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32)) + bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32)) + y = tf.matmul(x, weight) + bias + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_MLP.py b/examples/cnn/tf_models/tf_MLP.py new file mode 100644 index 0000000..8a23646 --- /dev/null +++ b/examples/cnn/tf_models/tf_MLP.py @@ -0,0 +1,34 @@ +import numpy as np +import tensorflow as tf + + +def tf_fc(x, shape, with_relu=True): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + if with_relu: + x = tf.nn.relu(x) + return x + + +def tf_mlp(x, y_, num_class=10): + ''' + MLP model in TensorFlow, for CIFAR dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building MLP model in tensorflow...") + x = tf_fc(x, (3072, 256), with_relu=True) + x = tf_fc(x, (256, 256), with_relu=True) + y = tf_fc(x, (256, num_class), with_relu=False) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_RNN.py b/examples/cnn/tf_models/tf_RNN.py new file mode 100644 index 0000000..c1d1436 --- /dev/null +++ b/examples/cnn/tf_models/tf_RNN.py @@ -0,0 +1,49 @@ +import numpy as np +import tensorflow as tf + + +def tf_rnn(x, y_): + ''' + RNN model in TensorFlow, for MNIST dataset. 
+ + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + + print("Building RNN model in tensorflow...") + diminput = 28 + dimhidden = 128 + dimoutput = 10 + nsteps = 28 + + weight1 = tf.Variable(np.random.normal( + scale=0.1, size=(diminput, dimhidden)).astype(np.float32)) + bias1 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, )).astype(np.float32)) + weight2 = tf.Variable(np.random.normal(scale=0.1, size=( + dimhidden + dimhidden, dimhidden)).astype(np.float32)) + bias2 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, )).astype(np.float32)) + weight3 = tf.Variable(np.random.normal( + scale=0.1, size=(dimhidden, dimoutput)).astype(np.float32)) + bias3 = tf.Variable(np.random.normal( + scale=0.1, size=(dimoutput, )).astype(np.float32)) + last_state = tf.zeros((128, dimhidden), dtype=tf.float32) + + for i in range(nsteps): + cur_x = tf.slice(x, (0, i * diminput), (-1, diminput)) + h = tf.matmul(cur_x, weight1) + bias1 + + s = tf.concat([h, last_state], axis=1) + s = tf.matmul(s, weight2) + bias2 + last_state = tf.nn.relu(s) + + final_state = last_state + y = tf.matmul(final_state, weight3) + bias3 + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y diff --git a/examples/cnn/tf_models/tf_ResNet.py b/examples/cnn/tf_models/tf_ResNet.py new file mode 100644 index 0000000..a0e0b2c --- /dev/null +++ b/examples/cnn/tf_models/tf_ResNet.py @@ -0,0 +1,113 @@ +import numpy as np +import tensorflow as tf + + +def tf_conv2d(x, in_channel, out_channel, stride=1): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) + x = tf.nn.conv2d(x, weight, strides=[1, stride, stride, 1], padding='SAME') + return x + + +def tf_batch_norm_with_relu(x, hidden): + scale = tf.Variable(np.random.normal( + scale=0.1, size=(hidden,)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(hidden,)).astype(np.float32)) + axis = list(range(len(x.shape) - 1)) + a_mean, a_var = tf.nn.moments(x, axis) + x = tf.nn.batch_normalization( + x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) + x = tf.nn.relu(x) + return x + + +def tf_resnet_block(x, in_channel, num_blocks, is_first=False): + if is_first: + out_channel = in_channel + identity = x + x = tf_conv2d(x, in_channel, out_channel, stride=1) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = x + identity + else: + out_channel = 2 * in_channel + identity = x + x = tf_batch_norm_with_relu(x, in_channel) + x = tf_conv2d(x, in_channel, out_channel, stride=2) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + identity = tf.nn.avg_pool(identity, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + identity = tf.pad(identity, [[0, 0], [0, 0], [0, 0], [ + in_channel // 2, in_channel // 2]]) + x = x + identity + + for i in range(1, num_blocks): + identity = x + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = tf_batch_norm_with_relu(x, out_channel) + x = tf_conv2d(x, out_channel, out_channel, stride=1) + x = x + identity + 
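In the downsampling branch above, the stride-2 convolution doubles the channel count while the identity path is average-pooled and zero-padded along channels so the two tensors can be added. The NumPy-only sketch below (not part of tf_ResNet.py) checks those shapes, assuming NHWC layout and in_channel = 16, which matches the base_size used by tf_resnet below.

```python
# Shape check of the zero-padded shortcut (assumes NHWC and in_channel = 16,
# so the stride-2 conv path produces 2 * 16 = 32 output channels).
import numpy as np

n, h, w, c = 2, 32, 32, 16
identity = np.zeros((n, h, w, c), dtype=np.float32)

# 2x2 average pooling with stride 2 ('VALID'): spatial dims are halved.
pooled = identity.reshape(n, h // 2, 2, w // 2, 2, c).mean(axis=(2, 4))

# Zero-pad c // 2 = 8 channels on each side so the channel count matches the conv path.
padded = np.pad(pooled, [(0, 0), (0, 0), (0, 0), (c // 2, c // 2)])

print(pooled.shape, padded.shape)  # (2, 16, 16, 16) (2, 16, 16, 32)
```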
+ return x + + +def tf_fc(x, shape): + weight = tf.Variable(np.random.normal( + scale=0.1, size=shape).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=shape[-1:]).astype(np.float32)) + x = tf.matmul(x, weight) + bias + return x + + +def tf_resnet(x, y_, num_layers, num_class=10): + ''' + ResNet model in TensorFlow, for CIFAR10 dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + print("Number of Class: {}".format(num_class)) + base_size = 16 + + x = tf_conv2d(x, 3, base_size, stride=1) + x = tf_batch_norm_with_relu(x, base_size) + + if num_layers == 18: + print("Building ResNet-18 model in tensorflow...") + x = tf_resnet_block(x, base_size, num_blocks=2, is_first=True) + x = tf_resnet_block(x, base_size, num_blocks=2) + x = tf_resnet_block(x, 2 * base_size, num_blocks=2) + x = tf_resnet_block(x, 4 * base_size, num_blocks=2) + elif num_layers == 34: + print("Building ResNet-34 model in tensorflow...") + x = tf_resnet_block(x, base_size, num_blocks=3, is_first=True) + x = tf_resnet_block(x, base_size, num_blocks=4) + x = tf_resnet_block(x, 2 * base_size, num_blocks=6) + x = tf_resnet_block(x, 4 * base_size, num_blocks=3) + else: + assert False, "Number of layers should be 18 or 34 !" + + x = tf_batch_norm_with_relu(x, 8 * base_size) + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, [-1, 128 * base_size]) + y = tf_fc(x, (128 * base_size, num_class)) + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y + + +def tf_resnet18(x, y_, num_class=10): + return tf_resnet(x, y_, 18, num_class) + + +def tf_resnet34(x, y_, num_class=10): + return tf_resnet(x, y_, 34, num_class) diff --git a/examples/cnn/tf_models/tf_VGG.py b/examples/cnn/tf_models/tf_VGG.py new file mode 100644 index 0000000..124457d --- /dev/null +++ b/examples/cnn/tf_models/tf_VGG.py @@ -0,0 +1,103 @@ +import numpy as np +import tensorflow as tf + + +def conv_bn_relu(x, in_channel, out_channel): + weight = tf.Variable(np.random.normal(scale=0.1, size=( + out_channel, in_channel, 3, 3)).transpose([2, 3, 1, 0]).astype(np.float32)) + scale = tf.Variable(np.random.normal( + scale=0.1, size=(out_channel,)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(out_channel,)).astype(np.float32)) + x = tf.nn.conv2d(x, weight, strides=[1, 1, 1, 1], padding='SAME') + axis = list(range(len(x.shape) - 1)) + a_mean, a_var = tf.nn.moments(x, axis) + x = tf.nn.batch_normalization( + x, mean=a_mean, variance=a_var, scale=scale, offset=bias, variance_epsilon=1e-2) + x = tf.nn.relu(x) + return x + + +def vgg_2block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def vgg_3block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def vgg_4block(x, in_channel, out_channel): + x = conv_bn_relu(x, in_channel, out_channel) + x = 
conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = conv_bn_relu(x, out_channel, out_channel) + x = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[ + 1, 2, 2, 1], padding='VALID') + return x + + +def tf_fc(x, in_feat, out_feat): + weight = tf.Variable(np.random.normal( + scale=0.1, size=(in_feat, out_feat)).astype(np.float32)) + bias = tf.Variable(np.random.normal( + scale=0.1, size=(out_feat,)).astype(np.float32)) + x = tf.matmul(x, weight) + bias + return x + + +def tf_vgg(x, y_, num_layers, num_class=10): + ''' + ResNet model in TensorFlow, for CIFAR10 dataset. + + Parameters: + x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, H, W, C) + y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + num_layers: 18 or 34 + Return: + loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) + y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) + ''' + if num_layers == 16: + print('Building VGG-16 model in tensorflow') + x = vgg_2block(x, 3, 64) + x = vgg_2block(x, 64, 128) + x = vgg_3block(x, 128, 256) + x = vgg_3block(x, 256, 512) + x = vgg_3block(x, 512, 512) + + elif num_layers == 19: + print('Building VGG-19 model in tensorflow') + x = vgg_2block(x, 3, 64) + x = vgg_2block(x, 64, 128) + x = vgg_4block(x, 128, 256) + x = vgg_4block(x, 256, 512) + x = vgg_4block(x, 512, 512) + else: + assert False, "Number of layers should be 18 or 34 !" + + x = tf.reshape(x, [-1, 512]) + x = tf_fc(x, 512, 4096) + x = tf_fc(x, 4096, 4096) + y = tf_fc(x, 4096, num_class) + print("Number of Class: {}".format(num_class)) + + loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + return loss, y + + +def tf_vgg16(x, y_, num_class=10): + return tf_vgg(x, y_, 16, num_class) + + +def tf_vgg19(x, y_, num_class=10): + return tf_vgg(x, y_, 34, num_class) diff --git a/examples/cnn/torch_main.py b/examples/cnn/torch_main.py new file mode 100644 index 0000000..f6ed928 --- /dev/null +++ b/examples/cnn/torch_main.py @@ -0,0 +1,213 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from pytorch_models import * +import hetu as ht +import numpy as np +import argparse +from time import time +import os +import logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def print_rank0(msg): + if local_rank % 8 == 0: + logger.info(msg) + + +def train(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None, optimizer=None): + print_rank0('Epoch: %d' % epoch) + n_train_batches = data.shape[0] // batch_size + + net.train() + + train_loss = 0 + correct = 0 + total = 0 + + for minibatch_index in range(n_train_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + inputs = torch.Tensor(data[minibatch_start:minibatch_end]) + targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() + + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + train_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + print_rank0("Train loss = %f" % (train_loss/(minibatch_index+1))) + print_rank0("Train accuracy = %f" % 
(100.*correct/total)) + + +def test(epoch=-1, net=None, data=None, label=None, batch_size=-1, criterion=None): + net.eval() + n_test_batches = data.shape[0] // batch_size + test_loss = 0 + correct = 0 + total = 0 + + with torch.no_grad(): + for minibatch_index in range(n_test_batches): + minibatch_start = minibatch_index * args.batch_size + minibatch_end = (minibatch_index + 1) * args.batch_size + inputs = torch.Tensor(data[minibatch_start:minibatch_end]) + targets = torch.Tensor(label[minibatch_start:minibatch_end]).long() + + inputs, targets = inputs.to(device), targets.to(device) + outputs = net(inputs) + loss = criterion(outputs, targets) + test_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + print_rank0("Validation loss = %f" % (test_loss/(minibatch_index+1))) + print_rank0("Validation accuracy = %f" % (100.*correct/total)) + + +if __name__ == "__main__": + # argument parser + global local_rank + local_rank = 0 + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, required=True, + help='model to be tested') + parser.add_argument('--dataset', type=str, required=True, + help='dataset to be trained on') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=20, help='epoch number') + parser.add_argument('--gpu', type=int, default=0, + help='gpu to be used, -1 means cpu') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + parser.add_argument('--distributed', action='store_true', + help='whether to distributed training') + parser.add_argument('--local_rank', type=int, default=-1) + args = parser.parse_args() + + if args.distributed == True: + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + print("***"*50) + print(init_method) + torch.distributed.init_process_group(backend="nccl", + world_size=world_size, + rank=rank, + init_method=init_method) + + if args.gpu == -1: + device = 'cpu' + else: + if args.distributed == True: + local_rank = rank % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + device = torch.device('cuda:%d' % local_rank) + logger.info('Use GPU %d.' % local_rank) + else: + device = torch.device('cuda:%d' % args.gpu) + torch.cuda.set_device(args.gpu) + print_rank0('Use GPU %d.' % args.gpu) + + assert args.model in ['mlp', 'resnet18', 'resnet34', + 'vgg16', 'vgg19', 'rnn'], 'Model not supported now.' 
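The --distributed branch above reads its rendezvous settings from environment variables rather than from a config file. The values in the sketch below are illustrative only; in a real run each process needs its own RANK, usually exported by the launcher.

```python
# Illustrative environment for one worker of a 2-process run of torch_main.py --distributed.
# RANK must differ per process; MASTER_ADDR/MASTER_PORT must point at the rank-0 host.
import os

os.environ.update({
    "MASTER_ADDR": "127.0.0.1",  # read via os.getenv('MASTER_ADDR', 'localhost') above
    "MASTER_PORT": "6000",       # read via os.getenv('MASTER_PORT', '6000') above
    "RANK": "0",                 # this process's global rank
    "WORLD_SIZE": "2",           # total number of processes
})
print("init method:", "tcp://%s:%s" % (os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]))
```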
+ + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + if args.model in ['resnet18', 'resnet34', 'vgg16', 'vgg19'] and args.dataset == 'CIFAR100': + net = eval(args.model)(100) + elif args.model == 'rnn': + net = eval(args.model)(28, 10, 128, 28) + else: + net = eval(args.model)() + + assert args.dataset in ['MNIST', 'CIFAR10', 'CIFAR100', 'ImageNet'] + dataset = args.dataset + + net.to(device) + if args.distributed: + net = torch.nn.parallel.DistributedDataParallel( + net, device_ids=[local_rank]) + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' + if args.opt == 'sgd': + print_rank0('Use SGD Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate) + elif args.opt == 'momentum': + print_rank0('Use Momentum Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9) + elif args.opt == 'nesterov': + print_rank0('Use Nesterov Momentum Optimizer.') + opt = optim.SGD(net.parameters(), lr=args.learning_rate, + momentum=0.9, nesterov=True) + elif args.opt == 'adagrad': + print_rank0('Use AdaGrad Optimizer.') + opt = optim.Adagrad(net.parameters(), lr=args.learning_rate) + else: + print_rank0('Use Adam Optimizer.') + opt = optim.Adam(lr=args.learning_rate) + + criterion = nn.CrossEntropyLoss() + + # data loading + print_rank0('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist(onehot=False) + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + elif dataset == 'CIFAR10': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=10, onehot=False) + if args.model == "mlp": + train_set_x = train_set_x.reshape(train_set_x.shape[0], -1) + valid_set_x = valid_set_x.reshape(valid_set_x.shape[0], -1) + elif dataset == 'CIFAR100': + train_set_x, train_set_y, valid_set_x, valid_set_y = ht.data.normalize_cifar( + num_class=100, onehot=False) + + running_time = 0 + # training + print_rank0("Start training loop...") + for i in range(args.num_epochs + 1): + if args.timing: + start = time() + train(epoch=i, net=net, data=train_set_x, label=train_set_y, + batch_size=args.batch_size, criterion=criterion, optimizer=opt) + if args.timing: + end = time() + print_rank0("Running time of current epoch = %fs" % (end - start)) + if i != 0: + running_time += (end - start) + test(epoch=i, net=net, data=valid_set_x, label=valid_set_y, + batch_size=args.batch_size, criterion=criterion) + + print_rank0("*"*50) + print_rank0("Running time of total %d epoch = %fs" % + (args.num_epochs, running_time)) diff --git a/examples/cnn/worker_conf0.json b/examples/cnn/worker_conf0.json new file mode 100644 index 0000000..b705130 --- /dev/null +++ b/examples/cnn/worker_conf0.json @@ -0,0 +1,9 @@ +{ +"DMLC_ROLE":"worker", +"WORKER_ID":"0", +"DMLC_PS_ROOT_URI":"127.0.0.1", +"DMLC_PS_ROOT_PORT":"13030", +"DMLC_NUM_WORKER":"2", +"DMLC_NUM_SERVER":"1", +"DMLC_PS_VAN_TYPE":"p3" +} diff --git a/examples/cnn/worker_conf1.json b/examples/cnn/worker_conf1.json new file mode 100644 index 0000000..831826d --- /dev/null +++ b/examples/cnn/worker_conf1.json @@ -0,0 +1,9 @@ +{ +"DMLC_ROLE":"worker", +"WORKER_ID":"1", +"DMLC_PS_ROOT_URI":"127.0.0.1", +"DMLC_PS_ROOT_PORT":"13030", +"DMLC_NUM_WORKER":"2", +"DMLC_NUM_SERVER":"1", +"DMLC_PS_VAN_TYPE":"p3" +} diff --git a/examples/ctr/.gitignore b/examples/ctr/.gitignore new file mode 100644 index 0000000..ccb0cb2 --- /dev/null +++ 
b/examples/ctr/.gitignore @@ -0,0 +1,2 @@ +datasets/ +logs/ diff --git a/examples/ctr/README.md b/examples/ctr/README.md new file mode 100644 index 0000000..c99c5db --- /dev/null +++ b/examples/ctr/README.md @@ -0,0 +1,109 @@ +# CTR Examples (with Distributed Settings) +In this directory we provide several models for CTR tasks. We use Wide & Deep model to train on Adult and Criteo dataset, and DeepFM, DCN, DC models on Criteo dataset. + +## Structure +``` +- ctr + - datasets/ contains sampled criteo data + - models/ ctr models in hetu + - tf_models/ ctr models in tensorflow + - settings/ configurations for distributed training + - tests/ test scripts + - kill.sh script to kill all python processes + - run_hetu.py basic trainer for hetu + - run_tf_local.py local trainer for tensorflow + - run_tf_horovod.py trainer for tensorflow in horovod setting + - run_tf_parallax.py trainer for tensorflow in parallax setting + - tf_launch_server.py launcher for server in tensorflow + - tf_launch_worker.py launcher for worker in tensorflow +``` + +## Prepare criteo data +* We have provided a sampled version of kaggle-criteo dataset, which locates in ./datasets/criteo/ . To use the given data, please do not specify the 'all' flag and 'val' flag when running test files. +* To download the original kaggle-criteo dataset, please specify a source in models/load_data.py and use ```python models/load_data.py``` to download the whole kaggle-criteo dataset. + + +## Flags for test files +Here we explain some of the flags you may use in test files: +* model: to specify the model, candidates are ('wdl_criteo', 'dfm_criteo', 'dcn_criteo', 'wdl_adult') +* config: to specify the configuration file in settings. +* val: whether using validation. +* cache: whether using cache in PS/Hybrid mode. +* bsp: whether using bsp (default asp) in PS/Hybrid mode. (In Hybrid, AllReduce can enforce dense parameters to use bsp, so there will be no stragglers.) +* all: whether to use all criteo data. +* bound: per embedding entry staleness in cache setting, default to be 100. + + +## Usage +If memory available, you can try to run the model locally, by running +```bash +# run locally +bash tests/local_{model}_{dataset}.sh +# run in ps setting (locally) +bash tests/ps_{model}_{dataset}.sh +# run in hybrid setting (locally) +bash tests/hybrid_{model}_{dataset}.sh + +# run tensorflow locally +python run_tf_local.py --model {model}_{dataset} +# run tensorflow in horovod +horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model {model}_{dataset} +# run tensorflow in parallax +python {absolute_path_to}/run_tf_parallax.py +# run tensorflow in ps setting +python tf_launch_server.py --config {config} --id {rank} +python tf_launch_worker.py --model {model}_{dataset} --rank {rank} --config {config} +``` + + +## Configuration +We use a simple yaml file to specify the run configuration. + +```yaml +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 +launch : + worker : 4 + server : 1 + scheduler : true +``` + +The 4 k-v pair in "shared" are used for PS-lite parameter server and will be added into environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" into an available IP address in the cluster. + +The following "launch" is only used in PS-mode (ommitted in hybrid mode). This means that the number of worker, server and scheduler launched locally on this machine. In hybrid mode, workers are launched by mpirun. 
Servers and schedulers will be launched by + + +## Examples +### Local execution +Run wdl with criteo locally(if the whole dataset is downloaded, you can use all data or use validate data): +```bash +python run_hetu.py --model wdl_criteo (--all) (--val) +``` + +### PS mode execution +Run ps locally, here we can also run on multiple nodes. +```bash +# launch scheduler and server, -n means number of servers, --sched means using scheduler +python -m hetu.launcher {config} -n 1 --sched +# launch workers (or run scheduler and server together if configured in config file) +python run_hetu.py --comm PS --model wdl_criteo --config {config} (--all) (--val) (--cache lfuopt) (--bound 10) +``` +You can also specify the cache to be used and also the cache bound. + + +### Hybrid mode execution +You must launch a scheduler and server in one terminal: +```bash +python -m hetu.launcher {config} -n 1 --sched +``` +And then launch the workers simultaneously using mpirun command: +```bash +mpirun -np {num_worker} --allow-run-as-root python run_hetu.py --comm Hybrid ... +``` +Or if in distributed nodes setting: +``` +mpirun -mca btl_tcp_if_include (network card name or ip) -x NCCL_SOCKET_IFNAME=(network card name) --host (host ips) --allow-run-as-root python run_hetu.py --comm Hybrid ... +``` diff --git a/examples/ctr/kill.sh b/examples/ctr/kill.sh new file mode 100755 index 0000000..4d99771 --- /dev/null +++ b/examples/ctr/kill.sh @@ -0,0 +1,3 @@ +#/bin/bash +#pkill -f mnist_mlp_ps.py +kill -9 $(pidof python) diff --git a/examples/ctr/models/__init__.py b/examples/ctr/models/__init__.py new file mode 100644 index 0000000..8c17586 --- /dev/null +++ b/examples/ctr/models/__init__.py @@ -0,0 +1,5 @@ +from .wdl_adult import wdl_adult +from .dcn_criteo import dcn_criteo +from .dc_criteo import dc_criteo +from .wdl_criteo import wdl_criteo +from .deepfm_criteo import dfm_criteo diff --git a/examples/ctr/models/dc_criteo.py b/examples/ctr/models/dc_criteo.py new file mode 100644 index 0000000..30b3522 --- /dev/null +++ b/examples/ctr/models/dc_criteo.py @@ -0,0 +1,63 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def residual_layer(x0, input_dim, hidden_dim): + + embedding_len = input_dim + weight_1 = init.random_normal( + shape=(input_dim, hidden_dim), stddev=0.1, name='weight_1') + bias_1 = init.random_normal(shape=(hidden_dim,), stddev=0.1, name='bias_1') + weight_2 = init.random_normal( + shape=(hidden_dim, input_dim), stddev=0.1, name='weight_2') + bias_2 = init.random_normal(shape=(input_dim,), stddev=0.1, name='bias_2') + + x0w = ht.matmul_op(x0, weight_1) # (batch, hidden_dim) + x0w_b = x0w + ht.broadcastto_op(bias_1, x0w) + + relu1 = ht.relu_op(x0w_b) + x1w = ht.matmul_op(relu1, weight_2) # (batch, input_dim) + x1w_b = x1w + ht.broadcastto_op(bias_2, x1w) + residual = x1w_b + x0 + y = ht.relu_op(residual) + return y + + +def build_residual_layers(x0, input_dim, hidden_dim, num_layers=3): + for i in range(num_layers): + x0 = residual_layer(x0, input_dim, hidden_dim) + return x0 + + +def dc_criteo(dense_input, sparse_input, y_): + + feature_dimension = 33762577 + embedding_size = 8 + learning_rate = 0.001 + + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding") + sparse_input = ht.embedding_lookup_op(Embedding, sparse_input) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + + # dc_model + x = ht.concat_op(sparse_input, dense_input, axis=1) + + input_dim = 26 * 8 + 13 + hidden_dim = 
input_dim + residual_out = build_residual_layers( + x, input_dim, hidden_dim, num_layers=5) + + W4 = init.random_normal([26*embedding_size + 13, 1], stddev=0.1, name="W4") + y = ht.matmul_op(residual_out, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/dcn_criteo.py b/examples/ctr/models/dcn_criteo.py new file mode 100644 index 0000000..2222d22 --- /dev/null +++ b/examples/ctr/models/dcn_criteo.py @@ -0,0 +1,68 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def cross_layer(x0, x1): + # x0: input embedding feature (batch_size, 26 * embedding_size + 13) + # x1: the output of last layer (batch_size, 26 * embedding_size + 13) + + embedding_len = 26 * 128 + 13 + weight = init.random_normal( + shape=(embedding_len, 1), stddev=0.01, name='weight') + bias = init.random_normal(shape=(embedding_len,), stddev=0.01, name='bias') + x1w = ht.matmul_op(x1, weight) # (batch_size, 1) + y = ht.mul_op(x0, ht.broadcastto_op(x1w, x0)) + y = y + x1 + ht.broadcastto_op(bias, y) + return y + + +def build_cross_layer(x0, num_layers=3): + x1 = x0 + for i in range(num_layers): + x1 = cross_layer(x0, x1) + return x1 + + +def dcn_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.003 + + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_input = ht.embedding_lookup_op( + Embedding, sparse_input, ctx=ht.cpu(0)) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + x = ht.concat_op(sparse_input, dense_input, axis=1) + # Cross Network + cross_output = build_cross_layer(x, num_layers=3) + + # DNN + flatten = x + W1 = init.random_normal( + [26*embedding_size + 13, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 256], stddev=0.01, name="W3") + + W4 = init.random_normal( + [256 + 26*embedding_size + 13, 1], stddev=0.01, name="W4") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = ht.concat_op(cross_output, y3, axis=1) + y = ht.matmul_op(y4, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/deepfm_criteo.py b/examples/ctr/models/deepfm_criteo.py new file mode 100644 index 0000000..d84f299 --- /dev/null +++ b/examples/ctr/models/deepfm_criteo.py @@ -0,0 +1,59 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def dfm_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 + + # FM + Embedding1 = init.random_normal( + [feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ht.cpu(0)) + FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter") + sparse_1dim_input = ht.embedding_lookup_op( + Embedding1, sparse_input, ctx=ht.cpu(0)) + fm_dense_part = ht.matmul_op(dense_input, FM_W) + fm_sparse_part = ht.reduce_sum_op(sparse_1dim_input, axes=1) + # fst order output + y1 = fm_dense_part + fm_sparse_part 
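The second-order part of the model, built next, uses the standard FM rewrite 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), summed over the embedding dimension, instead of enumerating all field pairs. The NumPy snippet below (synthetic data, not part of the model code) verifies that identity.

```python
# NumPy check that sum_{i<j} <v_i, v_j> equals
# 0.5 * ( (sum_i v_i)^2 - sum_i v_i^2 ) summed over the embedding dimension,
# which is the trick used by the sparse_2dim_* computation below.
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(26, 8))  # 26 field embeddings, embedding_size 8

pairwise = sum(v[i] @ v[j] for i in range(26) for j in range(i + 1, 26))
trick = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0)).sum()
print(np.allclose(pairwise, trick))  # True
```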
+ + Embedding2 = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_2dim_input = ht.embedding_lookup_op( + Embedding2, sparse_input, ctx=ht.cpu(0)) + sparse_2dim_sum = ht.reduce_sum_op(sparse_2dim_input, axes=1) + sparse_2dim_sum_square = ht.mul_op(sparse_2dim_sum, sparse_2dim_sum) + + sparse_2dim_square = ht.mul_op(sparse_2dim_input, sparse_2dim_input) + sparse_2dim_square_sum = ht.reduce_sum_op(sparse_2dim_square, axes=1) + sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum + sparse_2dim_half = sparse_2dim * 0.5 + # snd order output + y2 = ht.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True) + + # DNN + flatten = ht.array_reshape_op(sparse_2dim_input, (-1, 26*embedding_size)) + W1 = init.random_normal([26*embedding_size, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 1], stddev=0.01, name="W3") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = y1 + y2 + y = y4 + y3 + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/models/load_data.py b/examples/ctr/models/load_data.py new file mode 100644 index 0000000..6122342 --- /dev/null +++ b/examples/ctr/models/load_data.py @@ -0,0 +1,320 @@ +import os +import numpy as np + + +########################################################################### +# criteo +########################################################################### + +def download_criteo(path): + import tarfile + import pandas as pd + from six.moves import urllib + if not os.path.exists(path): + os.makedirs(path) + assert os.path.isdir(path), 'Please provide a directory path.' + # this source may be invalid, please use other valid sources. 
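+    # Note: this function downloads criteo.tar.gz into `path`, extracts it, converts
+    # train.txt to train.csv, and then writes six .npy arrays (train/test dense features,
+    # sparse features and labels, split 90%/10% by a random permutation) into `path`.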
+ origin = ( + 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz' + ) + print('Downloading data from %s' % origin) + dataset = os.path.join(path, 'criteo.tar.gz') + urllib.request.urlretrieve(origin, dataset) + print("Extracting criteo zip...") + with tarfile.open(dataset) as f: + f.extractall(path=path) + print("Create local files...") + + # save csv filed + df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None) + df.columns = ['label'] + ["I" + + str(i) for i in range(1, 14)] + ["C"+str(i) for i in range(14, 40)] + df.to_csv(os.path.join(path, "train.csv"), index=0) + print('Csv file saved.') + + # save numpy arrays + target_path = [os.path.join(path, filename) for filename in [ + 'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy', + 'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']] + dense_feats = [col for col in df.columns if col.startswith('I')] + sparse_feats = [col for col in df.columns if col.startswith('C')] + labels = df['label'] + dense_feats = process_dense_feats(df, dense_feats) + sparse_feats = process_sparse_feats(df, sparse_feats) + num_data = dense_feats.shape[0] + perm = np.random.permutation(num_data) + # split data in 2 parts + test_num = num_data // 10 + processed_data = [ + dense_feats[perm[:-test_num]], # train dense + sparse_feats[perm[:-test_num]], # train sparse + labels[perm[:-test_num]], # train labels + dense_feats[perm[-test_num:]], # validate dense + sparse_feats[perm[-test_num:]], # validate sparse + labels[perm[-test_num:]], # validate labels + ] + print('Array shapes:') + for i in range(len(processed_data)): + print(os.path.split(target_path[i]) + [-1].split('.')[0], processed_data[i].shape) + np.save(target_path[i], processed_data[i]) + print('Numpy arrays saved.') + + +def process_dense_feats(data, feats): + d = data.copy() + d = d[feats].fillna(0.0) + for f in feats: + d[f] = d[f].apply(lambda x: np.log(x+1) if x > -1 else -1) + return d + + +def process_sparse_feats(data, feats): + from sklearn.preprocessing import LabelEncoder + # process to embeddings. 
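+    # For example, if C14 takes the values {a, b} they become ids {0, 1}; C15's ids then
+    # start at 2, so values {x, y, z} become {2, 3, 4}. Every categorical column therefore
+    # indexes one shared id space, which is why the CTR models above use a single embedding
+    # table of size feature_dimension = 33762577 covering all 26 Criteo fields.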
+ d = data.copy() + d = d[feats].fillna("-1") + for f in feats: + label_encoder = LabelEncoder() + d[f] = label_encoder.fit_transform(d[f]) + feature_cnt = 0 + for f in feats: + d[f] += feature_cnt + feature_cnt += d[f].nunique() + return d + + +def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True): + import pandas as pd + csv_path = os.path.join(path, "train.csv") + if not os.path.exists(csv_path): + download_criteo(path) + df = pd.read_csv(csv_path, nrows=nrows, header=0) + dense_feats = [col for col in df.columns if col.startswith('I')] + sparse_feats = [col for col in df.columns if col.startswith('C')] + labels = np.array(df['label']).reshape(-1, 1) + dense_feats = np.array(process_dense_feats(df, dense_feats)) + sparse_feats = np.array(process_sparse_feats( + df, sparse_feats)).astype(np.int32) + if return_val: + test_num = nrows // 10 + train_dense = dense_feats[:-test_num] + train_sparse = sparse_feats[:-test_num] + train_label = labels[:-test_num] + validate_dense = dense_feats[-test_num:] + validate_sparse = sparse_feats[-test_num:] + validate_label = labels[-test_num:] + return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label) + else: + return dense_feats, sparse_feats, labels + + +def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')): + # all data should be available! no checking. + processed_data = [np.load(os.path.join(path, filename)) + for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']] + return tuple(processed_data) + + +def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True): + file_paths = [os.path.join(path, filename) for filename in [ + 'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy', + 'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']] + if not all([os.path.exists(p) for p in file_paths]): + download_criteo(path) + files = [np.load(filename) for filename in file_paths] + if return_val: + return (files[0], files[1]), (files[2], files[3]), (files[4], files[5]) + else: + return files[0], files[2], files[4] + + +########################################################################### +# adult +########################################################################### + +def maybe_download(train_data, test_data): + import pandas as pd + """if adult data "train.csv" and "test.csv" are not in your directory, + download them. 
+ """ + + COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", + "marital_status", "occupation", "relationship", "race", "gender", + "capital_gain", "capital_loss", "hours_per_week", "native_country", + "income_bracket"] + + if not os.path.exists(train_data): + print("downloading training data...") + df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + names=COLUMNS, skipinitialspace=True) + else: + df_train = pd.read_csv("train.csv") + + if not os.path.exists(test_data): + print("downloading testing data...") + df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", + names=COLUMNS, skipinitialspace=True, skiprows=1) + else: + df_test = pd.read_csv("test.csv") + + return df_train, df_test + + +def cross_columns(x_cols): + """simple helper to build the crossed columns in a pandas dataframe + """ + crossed_columns = dict() + colnames = ['_'.join(x_c) for x_c in x_cols] + for cname, x_c in zip(colnames, x_cols): + crossed_columns[cname] = x_c + return crossed_columns + + +def val2idx(df, cols): + """helper to index categorical columns before embeddings. + """ + val_types = dict() + for c in cols: + val_types[c] = df[c].unique() + + val_to_idx = dict() + for k, v in val_types.items(): + val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} + + for k, v in val_to_idx.items(): + df[k] = df[k].apply(lambda x: v[x]) + + unique_vals = dict() + for c in cols: + unique_vals[c] = df[c].nunique() + + return df, unique_vals + + +def onehot(x): + from sklearn.preprocessing import OneHotEncoder + return np.array(OneHotEncoder().fit_transform(x).todense()) + + +def wide(df_train, df_test, wide_cols, x_cols, target): + import pandas as pd + print('Processing wide data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_wide = pd.concat([df_train, df_test]) + + crossed_columns_d = cross_columns(x_cols) + categorical_columns = list( + df_wide.select_dtypes(include=['object']).columns) + + wide_cols += list(crossed_columns_d.keys()) + + for k, v in crossed_columns_d.items(): + df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) + + df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] + + dummy_cols = [ + c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] + df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) + + train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + assert all(train.columns == test.columns) + + cols = [c for c in train.columns if c != target] + X_train = train[cols].values + y_train = train[target].values.reshape(-1, 1) + X_test = test[cols].values + y_test = test[target].values.reshape(-1, 1) + return X_train, y_train, X_test, y_test + + +def load_adult_data(return_val=True): + import pandas as pd + df_train, df_test = maybe_download("train.csv", "test.csv") + + df_train['income_label'] = ( + df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + df_test['income_label'] = ( + df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + + age_groups = [0, 25, 65, 90] + age_labels = range(len(age_groups) - 1) + df_train['age_group'] = pd.cut( + df_train['age'], age_groups, labels=age_labels) + df_test['age_group'] = pd.cut( + df_test['age'], age_groups, labels=age_labels) + + # columns for wide model + wide_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 
'native_country', 'age_group'] + x_cols = (['education', 'occupation'], ['native_country', 'occupation']) + + # columns for deep model + embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country'] + cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] + + target = 'income_label' + + x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( + df_train, df_test, wide_cols, x_cols, target) + x_train_wide = np.array(x_train_wide).astype(np.float32) + x_test_wide = np.array(x_test_wide).astype(np.float32) + + print('Processing deep data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_deep = pd.concat([df_train, df_test]) + + deep_cols = embedding_cols + cont_cols + df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + # fit on the training rows, then scale the whole concatenated frame in place + # (assigning the raw array avoids realigning on the duplicated indices from pd.concat) + scaler.fit(df_train[cont_cols]) + df_deep[cont_cols] = scaler.transform(df_deep[cont_cols]) + df_deep, unique_vals = val2idx(df_deep, embedding_cols) + + train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + + x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) + y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) + x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) + y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) + + x_train_deep = np.transpose(x_train_deep) + x_test_deep = np.transpose(x_test_deep) + y_train = onehot(y_train) + y_test = onehot(y_test) + + if return_val: + return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test + else: + return x_train_deep, x_train_wide, y_train + + +########################################################################### +# avazu +########################################################################### + +def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')): + import pandas as pd + # please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data + train_file = os.path.join(path, 'train.csv') + # test_file = os.path.join(path, 'test.csv') # useless, no labels + + df_train = pd.read_csv(train_file) + sparse_feats = process_sparse_feats(df_train, df_train.columns[2:]) + # the embedding num for each feature: + # [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60] + # sum: 9449445 + + np.save(os.path.join(path, 'sparse.npy'), sparse_feats) + + +if __name__ == '__main__': + download_criteo(os.path.join(os.path.split( + os.path.abspath(__file__))[0], '../datasets/criteo')) diff --git a/examples/ctr/models/wdl_adult.py b/examples/ctr/models/wdl_adult.py new file mode 100644 index 0000000..1ce77e0 --- /dev/null +++ b/examples/ctr/models/wdl_adult.py @@ -0,0 +1,56 @@ +import hetu as ht +from hetu import init + + +def wdl_adult(X_deep, X_wide, y_): + lr = 5 / 128 + dim_wide = 809 + dim_deep = 68 + + W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W") + W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1") + b1 = init.random_normal([50], stddev=0.1, name="b1") + W2 = init.random_normal([50, 20], stddev=0.1, name="W2") + b2 = init.random_normal([20], stddev=0.1, name="b2") + + # deep + Embedding = [] + X_deep_input = None + + for i in range(8): + Embedding_name = "Embedding_deep_" + str(i) + Embedding.append(init.random_normal( + [50, 8],
stddev=0.1, name=Embedding_name)) + now = ht.embedding_lookup_op(Embedding[i], X_deep[i]) + now = ht.array_reshape_op(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + for i in range(4): + now = ht.array_reshape_op(X_deep[i + 8], (-1, 1)) + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + mat1 = ht.matmul_op(X_deep_input, W1) + add1 = mat1 + ht.broadcastto_op(b1, mat1) + relu1 = ht.relu_op(add1) + dropout1 = relu1 + mat2 = ht.matmul_op(dropout1, W2) + add2 = mat2 + ht.broadcastto_op(b2, mat2) + relu2 = ht.relu_op(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = ht.concat_op(X_wide, dmodel, 1) + wmodel = ht.matmul_op(wmodel, W) + + prediction = wmodel + loss = ht.softmaxcrossentropy_op(prediction, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=lr) + train_op = opt.minimize(loss) + + return loss, prediction, y_, train_op diff --git a/examples/ctr/models/wdl_criteo.py b/examples/ctr/models/wdl_criteo.py new file mode 100644 index 0000000..416ae24 --- /dev/null +++ b/examples/ctr/models/wdl_criteo.py @@ -0,0 +1,42 @@ +import hetu as ht +from hetu import init + +import numpy as np +import time + + +def wdl_criteo(dense_input, sparse_input, y_): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 + Embedding = init.random_normal( + [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ht.cpu(0)) + sparse_input = ht.embedding_lookup_op( + Embedding, sparse_input, ctx=ht.cpu(0)) + sparse_input = ht.array_reshape_op(sparse_input, (-1, 26*embedding_size)) + + # DNN + flatten = dense_input + W1 = init.random_normal([13, 256], stddev=0.01, name="W1") + W2 = init.random_normal([256, 256], stddev=0.01, name="W2") + W3 = init.random_normal([256, 256], stddev=0.01, name="W3") + + W4 = init.random_normal( + [256 + 26*embedding_size, 1], stddev=0.01, name="W4") + + fc1 = ht.matmul_op(flatten, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + y3 = ht.matmul_op(relu2, W3) + + y4 = ht.concat_op(sparse_input, y3, axis=1) + y = ht.matmul_op(y4, W4) + y = ht.sigmoid_op(y) + + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + + return loss, y, y_, train_op diff --git a/examples/ctr/run_hetu.py b/examples/ctr/run_hetu.py new file mode 100644 index 0000000..5745af3 --- /dev/null +++ b/examples/ctr/run_hetu.py @@ -0,0 +1,230 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import os.path as osp +import numpy as np +import yaml +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + + +def worker(args): + def train(iterations, auc_enabled=True, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + train_loss = [] + train_acc = [] + if auc_enabled: + train_auc = [] + for it in localiter: + loss_val, predict_y, y_val, _ = executor.run( + 'train', convert_to_numpy_ret_vals=True) + if y_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + y_val, + predict_y > 0.5).astype(np.float32) + else: + acc_val = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + if auc_enabled: + train_auc.append(metrics.roc_auc_score(y_val, predict_y)) + if auc_enabled: + return np.mean(train_loss), 
np.mean(train_acc), np.mean(train_auc) + else: + return np.mean(train_loss), np.mean(train_acc) + + def validate(iterations, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + test_loss = [] + test_acc = [] + test_auc = [] + for it in localiter: + loss_val, test_y_predicted, y_test_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + if y_test_val.shape[1] == 1: # for criteo case + correct_prediction = np.equal( + y_test_val, + test_y_predicted > 0.5).astype(np.float32) + else: + correct_prediction = np.equal( + np.argmax(y_test_val, 1), + np.argmax(test_y_predicted, 1)).astype(np.float32) + test_loss.append(loss_val[0]) + test_acc.append(correct_prediction) + test_auc.append(metrics.roc_auc_score( + y_test_val, test_y_predicted)) + return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc) + + def get_current_shard(data): + if args.comm is not None: + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + else: + return data + + batch_size = 128 + dataset = args.dataset + model = args.model + device_id = 0 + + if args.comm == 'PS': + rank = ht.get_worker_communicate().rank() + nrank = int(os.environ['DMLC_NUM_WORKER']) + device_id = rank % 8 + elif args.comm == 'Hybrid': + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + rank = comm.rank + nrank = int(os.environ['DMLC_NUM_WORKER']) + + if dataset == 'criteo': + # define models for criteo + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, labels = process_all_criteo_data( + return_val=args.val) + elif args.val: + from models.load_data import process_head_criteo_data + dense, sparse, labels = process_head_criteo_data(return_val=True) + else: + from models.load_data import process_sampled_criteo_data + dense, sparse, labels = process_sampled_criteo_data() + if isinstance(dense, tuple): + dense_input = ht.dataloader_op([[get_current_shard(dense[0]), batch_size, 'train'], [ + get_current_shard(dense[1]), batch_size, 'validate']]) + sparse_input = ht.dataloader_op([[get_current_shard(sparse[0]), batch_size, 'train'], [ + get_current_shard(sparse[1]), batch_size, 'validate']]) + y_ = ht.dataloader_op([[get_current_shard(labels[0]), batch_size, 'train'], [ + get_current_shard(labels[1]), batch_size, 'validate']]) + else: + dense_input = ht.dataloader_op( + [[get_current_shard(dense), batch_size, 'train']]) + sparse_input = ht.dataloader_op( + [[get_current_shard(sparse), batch_size, 'train']]) + y_ = ht.dataloader_op( + [[get_current_shard(labels), batch_size, 'train']]) + elif dataset == 'adult': + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data() + dense_input = [ + ht.dataloader_op([ + [get_current_shard(x_train_deep[:, i]), batch_size, 'train'], + [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'], + ]) for i in range(12) + ] + sparse_input = ht.dataloader_op([ + [get_current_shard(x_train_wide), batch_size, 'train'], + [get_current_shard(x_test_wide), batch_size, 'validate'], + ]) + y_ = ht.dataloader_op([ + [get_current_shard(y_train), batch_size, 'train'], + [get_current_shard(y_test), batch_size, 'validate'], + ]) + else: + raise NotImplementedError + print("Data loaded.") + + loss, prediction, y_, train_op = model(dense_input, sparse_input, y_) + + eval_nodes = {'train': [loss, prediction, y_, train_op]} + if args.val: + 
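# the 'validate' group reuses the same loss/prediction nodes but omits train_op, + # so executor.run('validate') only runs forward passes without updating parameters. +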
print('Validation enabled...') + eval_nodes['validate'] = [loss, prediction, y_] + executor_log_path = osp.join(osp.dirname(osp.abspath(__file__)), 'logs') + executor = ht.Executor(eval_nodes, ctx=ht.gpu(device_id), + comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123, log_path=executor_log_path) + + if args.all and dataset == 'criteo': + print('Processing all data...') + file_path = '%s_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[ + args.comm], args.raw_model) + file_path += '%d.log' % rank if args.comm else '.log' + file_path = osp.join(osp.dirname( + osp.abspath(__file__)), 'logs', file_path) + log_file = open(file_path, 'w') + total_epoch = args.nepoch if args.nepoch > 0 else 11 + for ep in range(total_epoch): + print("ep: %d" % ep) + ep_st = time.time() + train_loss, train_acc, train_auc = train(executor.get_batch_num( + 'train') // 10 + (ep % 10 == 9) * (executor.get_batch_num('train') % 10), tqdm_enabled=True) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ + % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (train_loss, train_acc, train_auc, ep_en - ep_st) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + else: + total_epoch = args.nepoch if args.nepoch > 0 else 50 + for ep in range(total_epoch): + if ep == 5: + start = time.time() + print("epoch %d" % ep) + ep_st = time.time() + train_loss, train_acc = train( + executor.get_batch_num('train'), auc_enabled=False) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" + % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc)) + else: + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (train_loss, train_acc, ep_en - ep_st)) + print('all time:', time.time() - start) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + parser.add_argument("--comm", default=None, + help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument("--config", type=str, default=osp.join(osp.dirname( + osp.abspath(__file__)), "./settings/local_s1_w4.yml"), help="configuration for ps") + parser.add_argument("--nepoch", type=int, default=-1, + help="num of epochs, each train 1/10 data") + args = parser.parse_args() + import models + print('Model:', args.model) + model = eval('models.' 
+ args.model) + args.dataset = args.model.split('_')[-1] + args.raw_model = args.model + args.model = model + if args.comm is None: + worker(args) + elif args.comm == 'Hybrid': + settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader) + value = settings['shared'] + os.environ['DMLC_ROLE'] = 'worker' + for k, v in value.items(): + os.environ[k] = str(v) + worker(args) + elif args.comm == 'PS': + launch(worker, args) + else: + raise NotImplementedError diff --git a/examples/ctr/run_tf_horovod.py b/examples/ctr/run_tf_horovod.py new file mode 100644 index 0000000..dc590cc --- /dev/null +++ b/examples/ctr/run_tf_horovod.py @@ -0,0 +1,174 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +import horovod.tensorflow as hvd + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# horovodrun -np 8 -H localhost:8 python run_tf_horovod.py --model +# horovodrun -np 8 --start-timeout 300 -H daim116:4,daim117:4 python run_tf_horovod.py --model +# if using multi nodes setting in conda, need to modify /etc/bash.bashrc +# we can also use mpirun (default gloo): +# ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ +# -x NCCL_SOCKET_IFNAME=enp97s0f0 -H daim117:8,daim118:8 --allow-run-as-root python run_tf_horovod.py --model + + +def train_criteo(model, args): + hvd.init() + + def get_current_shard(data): + part_size = data.shape[0] // hvd.size() + start = part_size * hvd.rank() + end = start + part_size if hvd.rank() != hvd.size() - \ + 1 else data.shape[0] + return data[start:end] + + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature = get_current_shard(dense[0]) + sparse_feature = get_current_shard(sparse[0]) + labels = get_current_shard(all_labels[0]) + val_dense = get_current_shard(dense[1]) + val_sparse = get_current_shard(sparse[1]) + val_labels = get_current_shard(all_labels[1]) + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + dense_feature = get_current_shard(dense_feature) + sparse_feature = get_current_shard(sparse_feature) + labels = get_current_shard(labels) + + batch_size = 128 + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + loss, y, opt = model(dense_input, sparse_input, y_) + global_step = tf.train.get_or_create_global_step() + # here in DistributedOptimizer by default all tensor are reduced on GPU + # can use device_sparse=xxx, device_dense=xxx to modify + # if using device_sparse='/cpu:0', the performance degrades + train_op = hvd.DistributedOptimizer( + opt).minimize(loss, global_step=global_step) + + gpu_options = tf.compat.v1.GPUOptions( + allow_growth=True, visible_device_list=str(hvd.local_rank())) + # here horovod default use gpu to initialize, which will cause OOM + hooks = [hvd.BroadcastGlobalVariablesHook(0, device='/cpu:0')] + sess = tf.compat.v1.train.MonitoredTrainingSession( + hooks=hooks, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if 
args.all: + raw_log_file = './logs/tf_hvd_%s_%d.log' % ( + args.model, hvd.local_rank()) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 400 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iterations = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + train_criteo(model, args) + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/run_tf_local.py b/examples/ctr/run_tf_local.py new file mode 100644 index 0000000..1b0bc6d --- /dev/null +++ b/examples/ctr/run_tf_local.py @@ -0,0 +1,202 @@ +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + + +def train_criteo(model, args): + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature, val_dense = dense + sparse_feature, val_sparse = sparse + labels, val_labels = all_labels + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + + batch_size = 128 + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + loss, y, opt = model(dense_input, sparse_input, y_) + train_op = opt.minimize(loss) + + init = tf.compat.v1.global_variables_initializer() + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + sess = tf.compat.v1.Session( + config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + sess.run(init) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = './logs/tf_local_%s.log' % (args.model) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 11 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iteration = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: 
start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def train_adult(model): + batch_size = 128 + total_epoch = 50 + dim_wide = 809 + + X_deep = [] + for i in range(8): + X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1])) + for i in range(4): + X_deep.append(tf.compat.v1.placeholder(tf.float32, [batch_size, 1])) + X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide]) + y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2]) + loss, y, train_op = model(X_deep, X_wide, y_) + + init = tf.global_variables_initializer() + + gpu_options = tf.GPUOptions(allow_growth=True) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + + sess.run(init) + + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False) + + iterations = x_train_deep.shape[0] // batch_size + for ep in range(total_epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + pre_index = 0 + + for it in range(iterations): + batch_x_deep = x_train_deep[pre_index:pre_index + batch_size] + batch_x_wide = x_train_wide[pre_index:pre_index + batch_size] + batch_y = y_train[pre_index:pre_index + batch_size] + pre_index += batch_size + + my_feed_dict = dict() + for i in range(12): + my_feed_dict[X_deep[i]] = np.array( + batch_x_deep[:, 1]).reshape(-1, 1) + + my_feed_dict[X_wide] = np.array(batch_x_wide) + my_feed_dict[y_] = batch_y + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + acc_val = np.equal( + np.argmax(loss_val[1], 1), + np.argmax(loss_val[2], 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' 
+ raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + + if dataset == 'criteo': + train_criteo(model, args) + elif dataset == 'adult': + train_adult(model) + else: + raise NotImplementedError + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/run_tf_parallax.py b/examples/ctr/run_tf_parallax.py new file mode 100644 index 0000000..9642775 --- /dev/null +++ b/examples/ctr/run_tf_parallax.py @@ -0,0 +1,211 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from sklearn import metrics + +from autodist import AutoDist +from autodist.resource_spec import ResourceSpec +from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax +from autodist.strategy.base import Strategy +from autodist.kernel.common.utils import get_op_name +from tensorflow.python.framework import ops + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# Please DO NOT modify /etc/bash.bashrc to activate conda environment. +# Use python_venv in spec yml file instead. +# Use absolute path of python file. +# Here we use the tf native partitioner instead of autodist's PartitionPS. + + +class Parallaxx(PSLoadBalancing, AllReduce): + """ + Modify original parallax to remove replica on CPUs. + """ + + def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0): + PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness) + AllReduce.__init__(self, chunk_size) + + # pylint: disable=attribute-defined-outside-init + def build(self, graph_item, resource_spec): + """Generate the strategy.""" + expr = Strategy() + + # For each variable, generate variable synchronizer config + expr.graph_config.replicas.extend( + [k for k, v in resource_spec.gpu_devices]) + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + self.loads = {ps: 0.0 for ps in reduction_device_names} + + # Generate node config + node_config = [] + for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): + var_op_name = get_op_name(var.name) + grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] + if isinstance(grad, ops.Tensor): # this is a dense variable + group_id = idx // self.chunk_size + config = self._gen_all_reduce_node_config( + var.name, group=group_id) + else: # sparse updates + # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. + # Sparse variables are likely larger, so keeping copies would be costlier, + # and usually each device only requires a small part of the overall variable. + config = self._gen_ps_node_config( + var, + # For Parallax Strategy, all PS vars are sparse which does not need proxy. 
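+ # (the False below disables the local proxy variable, so these large sparse + # embeddings live only on the parameter servers and are not replicated per device)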
+ False, + self._sync, + self._staleness + ) + node_config.append(config) + expr.node_config.extend(node_config) + + return expr + + +def train_criteo(model, args): + resource_spec_file = os.path.join(os.path.dirname( + __file__), 'settings', 'plx_local_spec.yml') + autodist = AutoDist(resource_spec_file, Parallaxx()) + respec = ResourceSpec(resource_spec_file) + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature, val_dense = dense + sparse_feature, val_sparse = sparse + labels, val_labels = all_labels + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + + # autodist will split the feeding data + batch_size = 128 + with tf.Graph().as_default() as g, autodist.scope(): + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + embed_partitioner = tf.fixed_size_partitioner( + len(respec.nodes), 0) if len(respec.nodes) > 1 else None + loss, y, opt = model(dense_input, sparse_input, + y_, embed_partitioner, False) + train_op = opt.minimize(loss) + + sess = autodist.create_distributed_session() + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = os.path.join(os.path.split(os.path.abspath(__file__))[ + 0], 'logs', 'tf_plx_%s.log' % (args.model)) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 11 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in tqdm(range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run( + [loss, y, y_, train_op], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + + else: + iteration = dense_feature.shape[0] // batch_size + + epoch = 50 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + 
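# sess.run below fetches [loss, y, y_, train_op]; the evaluated predictions and + # labels are used for per-batch accuracy (0.5 threshold for the single-logit + # criteo output, argmax for one-hot labels). +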
loss_val = sess.run( + [loss, y, y_, train_op], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print('all time:', (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + import tf_models + model = eval('tf_models.' + raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + + if dataset == 'criteo': + train_criteo(model, args) + else: + raise NotImplementedError + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/settings/local_s1.yml b/examples/ctr/settings/local_s1.yml new file mode 100644 index 0000000..a60a56b --- /dev/null +++ b/examples/ctr/settings/local_s1.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 0 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w2.yml b/examples/ctr/settings/local_s1_w2.yml new file mode 100644 index 0000000..34dc439 --- /dev/null +++ b/examples/ctr/settings/local_s1_w2.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 2 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 2 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w4.yml b/examples/ctr/settings/local_s1_w4.yml new file mode 100644 index 0000000..0790c40 --- /dev/null +++ b/examples/ctr/settings/local_s1_w4.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 4 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_s1_w8.yml b/examples/ctr/settings/local_s1_w8.yml new file mode 100644 index 0000000..a795608 --- /dev/null +++ b/examples/ctr/settings/local_s1_w8.yml @@ -0,0 +1,10 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 8 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 +launch : + worker : 8 + server : 1 + scheduler : true diff --git a/examples/ctr/settings/local_w4.yml b/examples/ctr/settings/local_w4.yml new file mode 100644 index 0000000..97b5bcd --- /dev/null +++ b/examples/ctr/settings/local_w4.yml @@ -0,0 +1,6 @@ +shared : + DMLC_PS_ROOT_URI : 127.0.0.1 + DMLC_PS_ROOT_PORT : 13100 + DMLC_NUM_WORKER : 4 + DMLC_NUM_SERVER : 1 + DMLC_PS_VAN_TYPE : p3 \ No newline at end of file diff --git a/examples/ctr/settings/plx_local_spec.yml b/examples/ctr/settings/plx_local_spec.yml new file mode 100644 index 0000000..11f9025 --- /dev/null +++ b/examples/ctr/settings/plx_local_spec.yml @@ -0,0 +1,4 @@ +nodes: + - address: localhost + cpus: [0] + gpus: [0,1,2,3,4,5,6,7] diff --git a/examples/ctr/settings/tf_local_s1_w2.json b/examples/ctr/settings/tf_local_s1_w2.json new file mode 100644 index 
0000000..c1e0148 --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w2.json @@ -0,0 +1,9 @@ +{ + "worker": [ + "127.0.0.1:12349", + "127.0.0.1:12348" + ], + "ps": [ + "127.0.0.1:12345" + ] +} \ No newline at end of file diff --git a/examples/ctr/settings/tf_local_s1_w4.json b/examples/ctr/settings/tf_local_s1_w4.json new file mode 100644 index 0000000..d0c339b --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w4.json @@ -0,0 +1,11 @@ +{ + "worker": [ + "127.0.0.1:23459", + "127.0.0.1:23458", + "127.0.0.1:23457", + "127.0.0.1:23456" + ], + "ps": [ + "127.0.0.1:23455" + ] +} \ No newline at end of file diff --git a/examples/ctr/settings/tf_local_s1_w8.json b/examples/ctr/settings/tf_local_s1_w8.json new file mode 100644 index 0000000..05d1f0e --- /dev/null +++ b/examples/ctr/settings/tf_local_s1_w8.json @@ -0,0 +1,15 @@ +{ + "worker": [ + "127.0.0.1:34569", + "127.0.0.1:34568", + "127.0.0.1:34567", + "127.0.0.1:34566", + "127.0.0.1:34565", + "127.0.0.1:34564", + "127.0.0.1:34563", + "127.0.0.1:34562" + ], + "ps": [ + "127.0.0.1:34575" + ] +} \ No newline at end of file diff --git a/examples/ctr/tests/hybrid_dcn_criteo.sh b/examples/ctr/tests/hybrid_dcn_criteo.sh new file mode 100644 index 0000000..8fcb203 --- /dev/null +++ b/examples/ctr/tests/hybrid_dcn_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_dfm_criteo.sh b/examples/ctr/tests/hybrid_dfm_criteo.sh new file mode 100644 index 0000000..caaa171 --- /dev/null +++ b/examples/ctr/tests/hybrid_dfm_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_wdl_adult.sh b/examples/ctr/tests/hybrid_wdl_adult.sh new file mode 100644 index 0000000..d3f41ee --- /dev/null +++ b/examples/ctr/tests/hybrid_wdl_adult.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/hybrid_wdl_criteo.sh b/examples/ctr/tests/hybrid_wdl_criteo.sh new file mode 100644 index 0000000..36a6b88 --- /dev/null +++ b/examples/ctr/tests/hybrid_wdl_criteo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml diff --git a/examples/ctr/tests/local_dcn_criteo.sh b/examples/ctr/tests/local_dcn_criteo.sh new file mode 100644 index 0000000..f1cd439 --- /dev/null +++ b/examples/ctr/tests/local_dcn_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model 
dcn_criteo --val diff --git a/examples/ctr/tests/local_dfm_criteo.sh b/examples/ctr/tests/local_dfm_criteo.sh new file mode 100644 index 0000000..d57378c --- /dev/null +++ b/examples/ctr/tests/local_dfm_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dfm_criteo --val diff --git a/examples/ctr/tests/local_wdl_adult.sh b/examples/ctr/tests/local_wdl_adult.sh new file mode 100644 index 0000000..4ccb1cd --- /dev/null +++ b/examples/ctr/tests/local_wdl_adult.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_adult --val diff --git a/examples/ctr/tests/local_wdl_criteo.sh b/examples/ctr/tests/local_wdl_criteo.sh new file mode 100644 index 0000000..cfd2e45 --- /dev/null +++ b/examples/ctr/tests/local_wdl_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_criteo --val diff --git a/examples/ctr/tests/ps_dcn_criteo.sh b/examples/ctr/tests/ps_dcn_criteo.sh new file mode 100644 index 0000000..fba3350 --- /dev/null +++ b/examples/ctr/tests/ps_dcn_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_dfm_criteo.sh b/examples/ctr/tests/ps_dfm_criteo.sh new file mode 100644 index 0000000..ee423e9 --- /dev/null +++ b/examples/ctr/tests/ps_dfm_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_wdl_adult.sh b/examples/ctr/tests/ps_wdl_adult.sh new file mode 100644 index 0000000..c0165b6 --- /dev/null +++ b/examples/ctr/tests/ps_wdl_adult.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/ps_wdl_criteo.sh b/examples/ctr/tests/ps_wdl_criteo.sh new file mode 100644 index 0000000..8d2f9a1 --- /dev/null +++ b/examples/ctr/tests/ps_wdl_criteo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../run_hetu.py + +python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml diff --git a/examples/ctr/tests/tf_2workers.sh b/examples/ctr/tests/tf_2workers.sh new file mode 100644 index 0000000..8f1c902 --- /dev/null +++ b/examples/ctr/tests/tf_2workers.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 1 > ${workdir}/../logs/temp1.log & +wait diff --git a/examples/ctr/tests/tf_4workers.sh b/examples/ctr/tests/tf_4workers.sh new file mode 100644 index 0000000..d1ad330 --- /dev/null +++ b/examples/ctr/tests/tf_4workers.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); 
pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 1 > ${workdir}/../logs/temp1.log & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 2 > ${workdir}/../logs/temp2.log & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 3 > ${workdir}/../logs/temp3.log & +wait diff --git a/examples/ctr/tests/tf_8workers.sh b/examples/ctr/tests/tf_8workers.sh new file mode 100644 index 0000000..d0ede08 --- /dev/null +++ b/examples/ctr/tests/tf_8workers.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/../tf_launch_worker.py + +rm -f logs/temp*.log +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 0 > ${workdir}/../logs/temp0.log & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 1 > ${workdir}/../logs/temp1.log & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 2 > ${workdir}/../logs/temp2.log & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 3 > ${workdir}/../logs/temp3.log & +CUDA_VISIBLE_DEVICES=4 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 4 > ${workdir}/../logs/temp4.log & +CUDA_VISIBLE_DEVICES=5 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 5 > ${workdir}/../logs/temp5.log & +CUDA_VISIBLE_DEVICES=6 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 6 > ${workdir}/../logs/temp6.log & +CUDA_VISIBLE_DEVICES=7 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w8.json --rank 7 > ${workdir}/../logs/temp7.log & +wait diff --git a/examples/ctr/tf_launch_server.py b/examples/ctr/tf_launch_server.py new file mode 100644 index 0000000..aaa0b7c --- /dev/null +++ b/examples/ctr/tf_launch_server.py @@ -0,0 +1,49 @@ +import os +import tensorflow as tf +import multiprocessing +import signal +import json +import argparse + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + os.environ['CUDA_VISIBLE_DEVICES'] = '' + + +pop_env() + + +def start_server(cluster, task_id): + server = tf.train.Server(cluster, job_name='ps', task_index=task_id) + server.join() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s4_w2.json', help="config file path") + parser.add_argument("--id", type=int, required=True) + args = parser.parse_args() + raw_config = args.config + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + global proc + proc = multiprocessing.Process( + target=start_server, args=[cluster, args.id, ]) + proc.start() + signal.signal(signal.SIGINT, signal_handler) + proc.join() + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + global proc + proc.kill() + exit(0) + + +if __name__ == '__main__': + main() diff --git 
a/examples/ctr/tf_launch_worker.py b/examples/ctr/tf_launch_worker.py new file mode 100644 index 0000000..e56825d --- /dev/null +++ b/examples/ctr/tf_launch_worker.py @@ -0,0 +1,353 @@ +import tensorflow as tf +import numpy as np +import argparse +import os +import time +import json +from sklearn import metrics +from tqdm import tqdm + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + + +def train_criteo(model, cluster, task_id, nrank, args): + def get_current_shard(data): + part_size = data.shape[0] // nrank + start = part_size * task_id + end = start + part_size if task_id != nrank - 1 else data.shape[0] + return data[start:end] + + if args.all: + from models.load_data import process_all_criteo_data + dense, sparse, all_labels = process_all_criteo_data() + dense_feature = get_current_shard(dense[0]) + sparse_feature = get_current_shard(sparse[0]) + labels = get_current_shard(all_labels[0]) + val_dense = get_current_shard(dense[1]) + val_sparse = get_current_shard(sparse[1]) + val_labels = get_current_shard(all_labels[1]) + else: + from models.load_data import process_sampled_criteo_data + dense_feature, sparse_feature, labels = process_sampled_criteo_data() + dense_feature = get_current_shard(dense_feature) + sparse_feature = get_current_shard(sparse_feature) + labels = get_current_shard(labels) + + batch_size = 128 + worker_device = "/job:worker/task:%d/gpu:0" % (task_id) + with tf.device(worker_device): + dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13]) + sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26]) + y_ = y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1]) + + with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): + server_num = len(cluster.as_dict()['ps']) + embed_partitioner = tf.fixed_size_partitioner( + server_num, 0) if server_num > 1 else None + loss, y, opt = model(dense_input, sparse_input, y_, + embed_partitioner, param_on_gpu=False) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + # sess.run(init) + if task_id == 0: + writer = tf.compat.v1.summary.FileWriter('logs/board', sess.graph) + + my_feed_dict = { + dense_input: np.empty(shape=(batch_size, 13)), + sparse_input: np.empty(shape=(batch_size, 26)), + y_: np.empty(shape=(batch_size, 1)), + } + + if args.all: + raw_log_file = './logs/tf_dist_%s_%d.log' % (args.model, task_id) + print('Processing all data, log to', raw_log_file) + log_file = open(raw_log_file, 'w') + iterations = dense_feature.shape[0] // batch_size + total_epoch = 21 + start_index = 0 + for ep in range(total_epoch): + print("epoch %d" % ep) + st_time = time.time() + train_loss, train_acc, train_auc = [], [], [] + for it in range(iterations // 10 + (ep % 10 == 9) * (iterations % 10)): + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + start_index += batch_size + if start_index + 
batch_size > dense_feature.shape[0]: + start_index = 0 + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + train_auc.append(metrics.roc_auc_score(true_val, pred_val)) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + tra_auc = np.mean(train_auc) + en_time = time.time() + train_time = en_time - st_time + + if args.val: + val_loss, val_acc, val_auc = [], [], [] + for it in range(val_dense.shape[0] // batch_size): + local_st = it * batch_size + my_feed_dict[dense_input][:] = val_dense[local_st: local_st + batch_size] + my_feed_dict[sparse_input][:] = val_sparse[local_st: local_st + batch_size] + my_feed_dict[y_][:] = val_labels[local_st: local_st+batch_size] + loss_val = sess.run([loss, y, y_], feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + acc_val = np.equal( + true_val, + pred_val > 0.5) + val_loss.append(loss_val[0]) + val_acc.append(acc_val) + val_auc.append(metrics.roc_auc_score(true_val, pred_val)) + v_accuracy = np.mean(val_acc) + v_loss = np.mean(val_loss) + v_auc = np.mean(val_auc) + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, v_loss, v_accuracy, v_auc, train_time) + else: + printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\ + % (tra_loss, tra_accuracy, tra_auc, train_time) + + print(printstr) + log_file.write(printstr + '\n') + log_file.flush() + else: + # here no val + iteration = dense_feature.shape[0] // batch_size + + epoch = 10 + for ep in range(epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + for idx in range(iteration): + start_index = idx * batch_size + my_feed_dict[dense_input][:] = dense_feature[start_index: start_index + batch_size] + my_feed_dict[sparse_input][:] = sparse_feature[start_index: start_index + batch_size] + my_feed_dict[y_][:] = labels[start_index: start_index+batch_size] + + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + pred_val = loss_val[1] + true_val = loss_val[2] + if pred_val.shape[1] == 1: # for criteo case + acc_val = np.equal( + true_val, + pred_val > 0.5) + else: + acc_val = np.equal( + np.argmax(pred_val, 1), + np.argmax(true_val, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print("tensorflow: ", (time.time() - start)) + + +def train_adult(model, cluster, task_id, nrank): + from models.load_data import load_adult_data + x_train_deep, x_train_wide, y_train = load_adult_data(return_val=False) + part_size = len(x_train_deep) // nrank + start = part_size * task_id + end = start + part_size if task_id != nrank - 1 else len(x_train_deep) + x_train_deep = x_train_deep[start:end] + x_train_wide = x_train_wide[start:end] + y_train = y_train[start:end] + + batch_size = 128 + total_epoch = 50 + dim_wide = 809 + + worker_device = "/job:worker/task:%d/gpu:0" % (task_id) + with tf.device(worker_device): + X_deep = [] + for i in range(8): + X_deep.append(tf.compat.v1.placeholder(tf.int32, [batch_size, 1])) + for i in range(4): + 
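# the remaining 4 placeholders take the continuous columns (age, capital_gain, + # capital_loss, hours_per_week); together with the 8 int32 categorical inputs + # above they match the 12 deep columns produced by load_adult_data. +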
X_deep.append(tf.compat.v1.placeholder( + tf.float32, [batch_size, 1])) + X_wide = tf.compat.v1.placeholder(tf.float32, [batch_size, dim_wide]) + y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 2]) + loss, y, train_op, global_step = model( + X_deep, X_wide, y_, cluster, task_id) + + with tf.device( + tf.compat.v1.train.replica_device_setter( + worker_device=worker_device, + cluster=cluster)): + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1, + global_step=global_step) + sess_config = tf.ConfigProto( + # allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + + iterations = x_train_deep.shape[0] // batch_size + for ep in range(total_epoch): + print('epoch', ep) + if ep == 5: + start = time.time() + ep_st = time.time() + train_loss = [] + train_acc = [] + pre_index = 0 + + for it in range(iterations): + batch_x_deep = x_train_deep[pre_index:pre_index + batch_size] + batch_x_wide = x_train_wide[pre_index:pre_index + batch_size] + batch_y = y_train[pre_index:pre_index + batch_size] + pre_index += batch_size + + my_feed_dict = dict() + for i in range(12): + my_feed_dict[X_deep[i]] = np.array( + batch_x_deep[:, i]).reshape(-1, 1) + + my_feed_dict[X_wide] = np.array(batch_x_wide) + my_feed_dict[y_] = batch_y + loss_val = sess.run([loss, y, y_, train_op], + feed_dict=my_feed_dict) + acc_val = np.equal( + np.argmax(loss_val[1], 1), + np.argmax(loss_val[2], 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + tra_accuracy = np.mean(train_acc) + tra_loss = np.mean(train_loss) + ep_en = time.time() + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (tra_loss, tra_accuracy, ep_en - ep_st)) + print("tensorflow: ", (time.time() - start)) + + +def test_bandwidth(cluster, task_id): + print('test bandwidth') + iters = 1000 + params_size = 128 * 9 + ps_device = "/job:ps/task:0/cpu:0" + worker_device = "/job:worker/task:%d/cpu:0" % (task_id) + + with tf.device(ps_device): + dtype = tf.int32 + params = tf.get_variable("params", shape=[params_size], dtype=dtype, + initializer=tf.zeros_initializer()) + with tf.device(tf.compat.v1.train.replica_device_setter( + worker_device=worker_device, + cluster=cluster)): + update = tf.get_variable("update", shape=[params_size], dtype=dtype, + initializer=tf.ones_initializer()) + add_op = params.assign(update) + + server = tf.train.Server( + cluster, job_name="worker", task_index=task_id) + init = tf.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(task_id == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_id]) + sess = sv.prepare_or_wait_for_session( + server.target, config=sess_config) + + sess.run(init) + # warm up + for i in range(5): + sess.run(add_op.op) + + start_time = time.time() + for i in range(iters): + sess.run(add_op.op) + elapsed_time = time.time() - start_time + ans = float(iters)*(params_size / 1024 / 1024)/elapsed_time + print("transfer rate: %f MB/s" % (ans)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to be tested") +
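# one possible local launch (run from examples/ctr, cf. tests/tf_2workers.sh): + # python tf_launch_server.py --config settings/tf_local_s1_w2.json --id 0 & + # CUDA_VISIBLE_DEVICES=0 python tf_launch_worker.py --model wdl_criteo --config settings/tf_local_s1_w2.json --rank 0 & + # CUDA_VISIBLE_DEVICES=1 python tf_launch_worker.py --model wdl_criteo --config settings/tf_local_s1_w2.json --rank 1 +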
parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, default='./settings/tf_dist_s1_w2.json', help="config file path") + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + raw_model = args.model + task_id = int(args.rank) + raw_config = args.config + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + if raw_model != 'band': + import tf_models + model = eval('tf_models.' + raw_model) + dataset = raw_model.split('_')[-1] + print('Model:', raw_model) + if dataset == 'criteo': + train_criteo(model, cluster, task_id, len(config['worker']), args) + elif dataset == 'adult': + # not support val or all + train_adult(model, cluster, task_id, len(config['worker'])) + else: + raise NotImplementedError + else: + test_bandwidth(cluster, task_id) + + +if __name__ == '__main__': + main() diff --git a/examples/ctr/tf_models/__init__.py b/examples/ctr/tf_models/__init__.py new file mode 100644 index 0000000..659f885 --- /dev/null +++ b/examples/ctr/tf_models/__init__.py @@ -0,0 +1,4 @@ +from .tf_dcn_criteo import dcn_criteo +from .tf_deepfm_criteo import dfm_criteo +from .tf_wdl_criteo import wdl_criteo +from .tf_wdl_adult import wdl_adult diff --git a/examples/ctr/tf_models/tf_dcn_criteo.py b/examples/ctr/tf_models/tf_dcn_criteo.py new file mode 100644 index 0000000..90a3b58 --- /dev/null +++ b/examples/ctr/tf_models/tf_dcn_criteo.py @@ -0,0 +1,69 @@ +import tensorflow as tf + + +def cross_layer(x0, x1, device): + # x0: input embedding feature (batch_size, 26 * embedding_size + 13) + # x1: the output of last layer (batch_size, 26 * embedding_size + 13) + + embed_dim = x1.shape[-1] + with tf.device(device): + w = tf.compat.v1.get_variable(name='w', shape=(embed_dim,)) + b = tf.compat.v1.get_variable(name='b', shape=(embed_dim,)) + x_1w = tf.tensordot(tf.reshape(x1, [-1, 1, embed_dim]), w, axes=1) + cross = x0 * x_1w + return cross + x1 + b + + +def build_cross_layer(x0, num_layers=3, device=tf.device('/gpu:0')): + x1 = x0 + for i in range(num_layers): + with tf.compat.v1.variable_scope('layer%d' % i): + x1 = cross_layer(x0, x1, device) + return x1 + + +def dcn_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.003 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('dcn', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding = tf.compat.v1.get_variable(name="Embedding", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_input_embedding = tf.nn.embedding_lookup( + Embedding, sparse_input) + + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + W1 = tf.compat.v1.get_variable( + name='W1', shape=[26*embedding_size + 13, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256]) + W4 = tf.compat.v1.get_variable( + name='W4', shape=[256 + 26 * embedding_size + 13, 1]) + + with tf.device('/gpu:0'): + flatten = tf.reshape(sparse_input_embedding, + (-1, 26*embedding_size)) + x = tf.concat((flatten, dense_input), 1) + # 
CrossNet + cross_output = build_cross_layer(x, num_layers=3, device=device) + # DNN + flatten = x + + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = tf.concat((cross_output, y3), 1) + y = tf.matmul(y4, W4) + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/ctr/tf_models/tf_deepfm_criteo.py b/examples/ctr/tf_models/tf_deepfm_criteo.py new file mode 100644 index 0000000..ea9d8f6 --- /dev/null +++ b/examples/ctr/tf_models/tf_deepfm_criteo.py @@ -0,0 +1,62 @@ +import tensorflow as tf + + +def dfm_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('dfm', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding1 = tf.compat.v1.get_variable(name="Embedding1", shape=( + feature_dimension, 1), partitioner=embed_partitioner) + Embedding2 = tf.compat.v1.get_variable(name="embeddings", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_1dim_input = tf.nn.embedding_lookup( + Embedding1, sparse_input) + sparse_2dim_input = tf.nn.embedding_lookup( + Embedding2, sparse_input) + + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + FM_W = tf.compat.v1.get_variable(name='FM_W', shape=[13, 1]) + W1 = tf.compat.v1.get_variable( + name='W1', shape=[26*embedding_size, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 1]) + + with tf.device('/gpu:0'): + fm_dense_part = tf.matmul(dense_input, FM_W) + fm_sparse_part = tf.reduce_sum(sparse_1dim_input, 1) + # fst order output + y1 = fm_dense_part + fm_sparse_part + + sparse_2dim_sum = tf.reduce_sum(sparse_2dim_input, 1) + sparse_2dim_sum_square = tf.multiply( + sparse_2dim_sum, sparse_2dim_sum) + + sparse_2dim_square = tf.multiply( + sparse_2dim_input, sparse_2dim_input) + sparse_2dim_square_sum = tf.reduce_sum(sparse_2dim_square, 1) + sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum + sparse_2dim_half = sparse_2dim * 0.5 + # snd order output + y2 = tf.reduce_sum(sparse_2dim_half, 1, keepdims=True) + + # DNN + flatten = tf.reshape(sparse_2dim_input, (-1, 26*embedding_size)) + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = y1 + y2 + y = y4 + y3 + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/ctr/tf_models/tf_wdl_adult.py b/examples/ctr/tf_models/tf_wdl_adult.py new file mode 100644 index 0000000..4137d08 --- /dev/null +++ b/examples/ctr/tf_models/tf_wdl_adult.py @@ -0,0 +1,77 @@ +import tensorflow as tf +import numpy as np + + +def wdl_adult(X_deep, X_wide, y_, cluster=None, task_id=None): + lr_ = 5 / 128 + dim_wide = 809 + dim_deep = 68 + use_ps = cluster is not None + + if use_ps: + device = 
tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d/gpu:0" % (task_id), + cluster=cluster)) + else: + device = tf.device('/gpu:0') + global_step = tf.Variable(0, name="global_step", trainable=False) + with device: + if use_ps: + global_step = tf.Variable(0, name="global_step", trainable=False) + + rand = np.random.RandomState(seed=123) + W = tf.Variable(rand.normal(scale=0.1, size=[ + dim_wide+20, 2]), dtype=tf.float32) + W1 = tf.Variable(rand.normal(scale=0.1, size=[ + dim_deep, 50]), dtype=tf.float32) + b1 = tf.Variable(rand.normal(scale=0.1, size=[50]), dtype=tf.float32) + W2 = tf.Variable(rand.normal( + scale=0.1, size=[50, 20]), dtype=tf.float32) + b2 = tf.Variable(rand.normal(scale=0.1, size=[20]), dtype=tf.float32) + + Embedding = [] + + for i in range(8): + Embedding.append(tf.Variable(rand.normal( + scale=0.1, size=[20, 8]), dtype=tf.float32)) + + # deep + X_deep_input = None + for i in range(8): + now = tf.nn.embedding_lookup(Embedding[i], X_deep[i]) + now = tf.reshape(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = tf.concat([X_deep_input, now], 1) + + for i in range(4): + now = tf.reshape(X_deep[i + 8], (-1, 1)) + X_deep_input = tf.concat([X_deep_input, now], 1) + + mat1 = tf.matmul(X_deep_input, W1) + add1 = tf.add(mat1, b1) + relu1 = tf.nn.relu(add1) + dropout1 = relu1 + mat2 = tf.matmul(dropout1, W2) + add2 = tf.add(mat2, b2) + relu2 = tf.nn.relu(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = tf.concat([X_wide, dmodel], 1) + wmodel = tf.matmul(wmodel, W) + + y = wmodel + loss = tf.reduce_mean( + tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y) + ) + + optimizer = tf.train.GradientDescentOptimizer(lr_) + train_op = optimizer.minimize(loss, global_step=global_step) + + if use_ps: + return loss, y, train_op, global_step + else: + return loss, y, train_op diff --git a/examples/ctr/tf_models/tf_wdl_criteo.py b/examples/ctr/tf_models/tf_wdl_criteo.py new file mode 100644 index 0000000..d3d7454 --- /dev/null +++ b/examples/ctr/tf_models/tf_wdl_criteo.py @@ -0,0 +1,40 @@ +import tensorflow as tf + + +def wdl_criteo(dense_input, sparse_input, y_, partitioner=None, part_all=True, param_on_gpu=True): + feature_dimension = 33762577 + embedding_size = 128 + learning_rate = 0.01 / 8 # here to comply with HETU + all_partitioner, embed_partitioner = ( + partitioner, None) if part_all else (None, partitioner) + with tf.compat.v1.variable_scope('wdl', dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.01), partitioner=all_partitioner): + with tf.device('/cpu:0'): + Embedding = tf.compat.v1.get_variable(name="Embedding", shape=( + feature_dimension, embedding_size), partitioner=embed_partitioner) + sparse_input_embedding = tf.nn.embedding_lookup( + Embedding, sparse_input) + device = '/gpu:0' if param_on_gpu else '/cpu:0' + with tf.device(device): + W1 = tf.compat.v1.get_variable(name='W1', shape=[13, 256]) + W2 = tf.compat.v1.get_variable(name='W2', shape=[256, 256]) + W3 = tf.compat.v1.get_variable(name='W3', shape=[256, 256]) + W4 = tf.compat.v1.get_variable( + name='W4', shape=[256 + 26 * embedding_size, 1]) + with tf.device('/gpu:0'): + sparse_input_embedding = tf.reshape( + sparse_input_embedding, (-1, 26*embedding_size)) + flatten = dense_input + fc1 = tf.matmul(flatten, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + y3 = tf.matmul(relu2, W3) + + y4 = tf.concat((sparse_input_embedding, y3), 1) + y = tf.matmul(y4, W4) + loss = 
tf.reduce_mean(
+                tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_))
+
+            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+                learning_rate)
+    return loss, y, optimizer
diff --git a/examples/gnn/README.md b/examples/gnn/README.md
new file mode 100644
index 0000000..28ea840
--- /dev/null
+++ b/examples/gnn/README.md
@@ -0,0 +1,112 @@
+# GNN Examples (with Distributed Settings)
+
+## Structure
+```
+- gnn
+    - gnn_tools/          scripts to prepare data and other utilities
+    - config/             distributed configurations
+    - gnn_model/          gnn models
+    - run_dist.py         train gnn models in the PS setting
+    - run_dist_hybrid.py  train gnn models in the hybrid setting
+    - run_single.py       train with a single gpu
+
+```
+
+## Configuration file explained
+
+We use a simple yaml file to specify the run configuration.
+
+```yaml
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 4
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 4
+  server : 1
+  graph_server : 1
+  scheduler : true
+```
+
+The four key-value pairs in "shared" are used by the PS-lite parameter server and will be added to the environment. When running on a cluster, you should change "DMLC_PS_ROOT_URI" to an IP address reachable within the cluster.
+
+The difference between GNN models and other models is that we need to launch a set of graph servers to carry out graph sampling. Note that the total number of graph servers MUST equal the number of graph partitions. It is recommended to partition the graph into as many parts as there are machines and to launch one graph server on each machine.
+
+Note that there should be only one scheduler, and it should only be launched on the machine with DMLC_PS_ROOT_URI.
+
+Note that the launcher automatically selects a network interface for you. If this fails, try adding "DMLC_INTERFACE : eth0" to the "shared" section to select the right network device.
+
+## Prepare graph datasets
+
+1. Prepare a normal dataset (dense features, no embedding)
+
+   ```shell
+   python3 -m graphmix.partition [-d DatasetName] -n 4 -p ~/yourDataPath
+   ```
+
+   We currently provide the following datasets: Cora, PubMed, Reddit, Flickr, Yelp, ogbn-products, ogbn-arxiv.
+
+2. Prepare the ogbn-mag or Reddit dataset (with sparse embedding)
+
+   You can use the following command to partition the graph into 4 parts for 4 workers to use.
+
+   ```bash
+   python3 gnn_tools/part_graph.py [-d DatasetName] -n 4 -p ~/yourDataPath
+   ```
+
+   Also note that if you want to train on K nodes, replace -n 4 with -n K.
+
+3. Prepare the Amazon dataset: this dataset is introduced in the Cluster-GCN paper, and there are two files to download: [metadata.json](https://drive.google.com/file/d/0B2jJQxNRDl_rVVZCdWVnYmUyRDg) and [map_files](https://drive.google.com/file/d/0B3lPMIHmG6vGd2U3VHB0Wkk4cGM). Once you have downloaded and extracted the files and put them together under the gnn_tools directory, you can run
+
+   ```bash
+   python3 prepare_amazon_data.py
+   ```
+
+   Note that you need nltk installed in your environment to run this script, and it will take a while.
+
+   After running the script, you will get two output files: graph.npz and sparsefeature.npy. Put them in the right place:
+
+   ```bash
+   mkdir -p ~/.graphmix_dataset/AmazonSparse
+   mv graph.npz sparsefeature.npy ~/.graphmix_dataset/AmazonSparse
+   ```
+
+   Finally, use part_graph.py to partition the graph:
+
+   ```
+   python3 gnn_tools/part_graph.py -d AmazonSparse -n 4 -p ~/yourDataPath
+   ```
+
+## Training GNN Models
+
+After you have prepared a graph dataset, you can start training embedding models on it. We take Reddit as an example.
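+
+For instance, a complete single-machine run on Reddit prepared with the sparse-embedding script above could look like the following sketch, using the bundled 4-worker config (`config/local_w4.yml`); the data path is a placeholder, and the individual communication modes are described below:
+
+```bash
+# partition Reddit into 4 parts (done once); the 4 parts match the 4 graph servers in config/local_w4.yml
+python3 gnn_tools/part_graph.py -d Reddit -n 4 -p ~/yourDataPath
+# launch PS-mode training: 4 workers, 1 server, 4 graph servers and the scheduler on this machine
+python3 run_dist.py config/local_w4.yml -p ~/yourDataPath/Reddit
+```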
+
+To train in PS communication mode, run
+
+```
+python3 run_dist.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+To train in Hybrid communication mode, run
+
+```
+mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit [--dense]
+```
+
+When running in Hybrid mode, you will also have to launch the servers and the scheduler separately
+
+```
+python3 run_dist_hybrid.py [configfile] -p ~/yourDataPath/Reddit --server
+```
+
+Add the --dense argument if you are training with a normal dataset (dense features).
+
+## Train with a single card
+
+In this case you still have to run the partition step described above, with -n 1. After that, run
+
+```shell
+python3 run_single.py -p ~/yourDataPath/Reddit [--dense]
+```
+
diff --git a/examples/gnn/config/local_w2.yml b/examples/gnn/config/local_w2.yml
new file mode 100644
index 0000000..5e53073
--- /dev/null
+++ b/examples/gnn/config/local_w2.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 2
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 2
+  server : 1
+  graph_server : 1
+  scheduler : true
diff --git a/examples/gnn/config/local_w4.yml b/examples/gnn/config/local_w4.yml
new file mode 100644
index 0000000..8a7ea2d
--- /dev/null
+++ b/examples/gnn/config/local_w4.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 4
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 4
+  server : 1
+  graph_server : 4
+  scheduler : true
diff --git a/examples/gnn/config/local_w8.yml b/examples/gnn/config/local_w8.yml
new file mode 100644
index 0000000..cc83196
--- /dev/null
+++ b/examples/gnn/config/local_w8.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 8
+  DMLC_NUM_SERVER : 1
+launch :
+  worker : 8
+  server : 1
+  graph_server : 4
+  scheduler : true
diff --git a/examples/gnn/config/single.yml b/examples/gnn/config/single.yml
new file mode 100644
index 0000000..f669fca
--- /dev/null
+++ b/examples/gnn/config/single.yml
@@ -0,0 +1,10 @@
+shared :
+  DMLC_PS_ROOT_URI : 127.0.0.1
+  DMLC_PS_ROOT_PORT : 13100
+  DMLC_NUM_WORKER : 1
+  DMLC_NUM_SERVER : 0
+launch :
+  worker : 1
+  server : 0
+  graph_server : 1
+  scheduler : true
diff --git a/examples/gnn/gnn_model/__init__.py b/examples/gnn/gnn_model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/gnn/gnn_model/layer.py b/examples/gnn/gnn_model/layer.py
new file mode 100644
index 0000000..9270a02
--- /dev/null
+++ b/examples/gnn/gnn_model/layer.py
@@ -0,0 +1,68 @@
+import hetu as ht
+from hetu import init
+
+
+class GCN(object):
+    def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0,
+                 name="GCN", custom_init=None):
+        if custom_init is not None:
+            self.weight = ht.Variable(
+                value=custom_init[0], name=name+"_Weight")
+            self.bias = ht.Variable(value=custom_init[1], name=name+"_Bias")
+        else:
+            self.weight = init.xavier_uniform(
+                shape=(in_features, out_features), name=name+"_Weight")
+            self.bias = init.zeros(shape=(out_features,), name=name+"_Bias")
+        # self.mp is a sparse matrix and should appear in feed_dict later
+        self.mp = norm_adj
+        self.activation = activation
+        self.dropout = dropout
+        self.output_width = out_features
+
+    def __call__(self, x):
+        """
+        Build the computation graph, return the output node
+        """
+        if self.dropout > 0:
+            x = ht.dropout_op(x, 1 - self.dropout)
+        x = ht.matmul_op(x, self.weight)
+        msg = x + ht.broadcastto_op(self.bias,
x) + x = ht.csrmm_op(self.mp, msg) + if self.activation == "relu": + x = ht.relu_op(x) + elif self.activation is not None: + raise NotImplementedError + return x + + +class SageConv(object): + def __init__(self, in_features, out_features, norm_adj, activation=None, dropout=0, + name="GCN", custom_init=None, mp_val=None): + + self.weight = init.xavier_uniform( + shape=(in_features, out_features), name=name+"_Weight") + self.bias = init.zeros(shape=(out_features,), name=name+"_Bias") + self.weight2 = init.xavier_uniform( + shape=(in_features, out_features), name=name+"_Weight") + # self.mp is a sparse matrix and should appear in feed_dict later + self.mp = norm_adj + self.activation = activation + self.dropout = dropout + self.output_width = 2 * out_features + + def __call__(self, x): + """ + Build the computation graph, return the output node + """ + feat = x + if self.dropout > 0: + x = ht.dropout_op(x, 1 - self.dropout) + + x = ht.csrmm_op(self.mp, x) + x = ht.matmul_op(x, self.weight) + x = x + ht.broadcastto_op(self.bias, x) + if self.activation == "relu": + x = ht.relu_op(x) + elif self.activation is not None: + raise NotImplementedError + return ht.concat_op(x, ht.matmul_op(feat, self.weight2), axis=1) diff --git a/examples/gnn/gnn_model/model.py b/examples/gnn/gnn_model/model.py new file mode 100644 index 0000000..502271c --- /dev/null +++ b/examples/gnn/gnn_model/model.py @@ -0,0 +1,62 @@ +import hetu as ht +import numpy as np +from .layer import GCN, SageConv + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + + +def sparse_model(int_feature, hidden_layer_size, embedding_idx_max, embedding_width, num_classes, lr): + y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot( + g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu())) + mask_ = ht.Variable(name="mask_") + index_ = ht.GNNDataLoaderOp(lambda g: ht.array( + g.i_feat[:, 0:-2], ctx=ht.cpu()), ctx=ht.cpu()) + embedding = ht.init.random_normal( + [embedding_idx_max, embedding_width], stddev=0.1) + embed = ht.embedding_lookup_op(embedding, index_) + feat = ht.array_reshape_op(embed, (-1, int_feature * embedding_width)) + + norm_adj_ = ht.Variable("message_passing", trainable=False, value=None) + gcn1 = GCN(int_feature * embedding_width, + hidden_layer_size, norm_adj_, activation="relu") + gcn2 = GCN(gcn1.output_width, num_classes, norm_adj_) + x = gcn1(feat) + y = gcn2(x) + loss = ht.softmaxcrossentropy_op(y, y_) + train_loss = loss * mask_ + train_loss = ht.reduce_mean_op(train_loss, [0]) + opt = ht.optim.SGDOptimizer(lr) + train_op = opt.minimize(train_loss) + # model input & model output + return [loss, y, train_op], [mask_, norm_adj_] + + +def dense_model(feature_dim, hidden_layer_size, num_classes, lr, arch=GCN): + y_ = ht.GNNDataLoaderOp(lambda g: ht.array(convert_to_one_hot( + g.i_feat[:, -2], max_val=num_classes), ctx=ht.cpu())) + mask_ = ht.Variable(name="mask_") + feat = ht.GNNDataLoaderOp(lambda g: ht.array( + g.f_feat, ctx=ht.cpu()), ctx=ht.cpu()) + + norm_adj_ = ht.Variable("message_passing", trainable=False, value=None) + gcn1 = arch(feature_dim, hidden_layer_size, norm_adj_, activation="relu") + gcn2 = arch(gcn1.output_width, hidden_layer_size, + norm_adj_, activation="relu") + classifier = ht.init.xavier_uniform(shape=(gcn2.output_width, num_classes)) + x = gcn1(feat) + x = gcn2(x) + y = 
ht.matmul_op(x, classifier) + loss = ht.softmaxcrossentropy_op(y, y_) + train_loss = loss * mask_ + train_loss = ht.reduce_mean_op(train_loss, [0]) + opt = ht.optim.SGDOptimizer(lr) + train_op = opt.minimize(train_loss) + # model input & model output + return [loss, y, train_op], [mask_, norm_adj_] diff --git a/examples/gnn/gnn_model/utils.py b/examples/gnn/gnn_model/utils.py new file mode 100644 index 0000000..33feeb9 --- /dev/null +++ b/examples/gnn/gnn_model/utils.py @@ -0,0 +1,46 @@ +import hetu +import graphmix +import numpy as np +from tqdm import tqdm + + +def padding(graph, target_num_nodes): + assert graph.num_nodes <= target_num_nodes + graph.convert2coo() + new_graph = graphmix.Graph(graph.edge_index, target_num_nodes) + new_graph.tag = graph.tag + new_graph.type = graph.type + extra = target_num_nodes - graph.num_nodes + new_graph.i_feat = np.concatenate( + [graph.i_feat, np.tile(graph.i_feat[0], [extra, 1])]) + new_graph.f_feat = np.concatenate( + [graph.f_feat, np.tile(graph.f_feat[0], [extra, 1])]) + if graph.extra.size: + new_graph.extra = np.concatenate([graph.extra, np.zeros([extra, 1])]) + return new_graph + + +def prepare_data(ngraph): + cli = graphmix.Client() + graphs = [] + for i in tqdm(range(ngraph)): + query = cli.pull_graph() + graph = cli.wait(query) + graphs.append(graph) + max_num_nodes = 0 + for i in range(ngraph): + max_num_nodes = max(max_num_nodes, graphs[i].num_nodes) + for i in range(ngraph): + graphs[i] = padding(graphs[i], max_num_nodes) + return graphs + + +def get_norm_adj(graph, device, use_original_gcn_norm=False): + norm = graph.gcn_norm(use_original_gcn_norm) + mp_mat = hetu.ndarray.sparse_array( + values=norm, + indices=(graph.edge_index[1], graph.edge_index[0]), + shape=(graph.num_nodes, graph.num_nodes), + ctx=device + ) + return mp_mat diff --git a/examples/gnn/gnn_tools/__init__.py b/examples/gnn/gnn_tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/gnn/gnn_tools/launcher.py b/examples/gnn/gnn_tools/launcher.py new file mode 100644 index 0000000..bebb77b --- /dev/null +++ b/examples/gnn/gnn_tools/launcher.py @@ -0,0 +1,135 @@ +import os +import os.path as osp +import signal +import yaml +import multiprocessing + +import libc_graphmix as _C +import hetu as ht +from graphmix.shard import Shard + +default_graph_root_port = 27770 + + +def start_graph_server(shard, server_init): + os.environ['GRAPHMIX_ROLE'] = "server" + _C.init() + shard.load_graph_shard(_C.rank()) + server = _C.start_server() + server.init_meta(shard.meta) + server.init_data(shard.f_feat, shard.i_feat, shard.edges) + del shard + print("GraphMix Server {} : data initialized at {}:{}".format( + _C.rank(), _C.ip(), _C.port())) + _C.barrier_all() + server_init(server) + _C.finalize() + + +def start_server(): + os.environ["DMLC_ROLE"] = "server" + ht.server_init() + ht.server_finish() + +# two scheduler in one process + + +def start_scheduler(): + os.environ['GRAPHMIX_ROLE'] = "scheduler" + os.environ['DMLC_ROLE'] = "scheduler" + _C.init() + ht.scheduler_init() + ht.scheduler_finish() + _C.finalize() + + +def start_worker(func, args): + os.environ['GRAPHMIX_ROLE'] = "worker" + os.environ['DMLC_ROLE'] = "worker" + _C.init() + ht.worker_init() + args.local_rank = _C.rank() % args.num_local_worker + _C.barrier_all() + func(args) + ht.worker_finish() + _C.finalize() + + +def start_worker_standalone(func, args, local_rank): + args.local_rank = local_rank + func(args) + + +def signal_handler(signal, frame): + print("SIGINT signal caught, stop Training") + 
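+    # terminate every launched worker/server/scheduler process before exiting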
for proc in process_list: + proc.kill() + exit(0) + + +process_list = [] + + +def launch_graphmix_and_hetu_ps(target, args, server_init, hybrid_config=None): + # open setting file + file_path = osp.abspath(osp.expanduser(osp.normpath(args.config))) + with open(file_path) as setting_file: + settings = yaml.load(setting_file.read(), Loader=yaml.FullLoader) + + # write environment variables + for key, value in settings["shared"].items(): + os.environ[str(key)] = str(value) + + # the graph data path is relative to the setting file path + graph_data_path = osp.abspath(osp.expanduser(osp.normpath(args.path))) + print("GraphMix launcher : Using Graph Data from ", graph_data_path) + + # load graph and set the server number equal to the number of graph parts + shard = Shard(graph_data_path) + os.environ['GRAPHMIX_NUM_SERVER'] = str(shard.meta["num_part"]) + os.environ['GRAPHMIX_NUM_WORKER'] = os.environ['DMLC_NUM_WORKER'] + os.environ['GRAPHMIX_ROOT_URI'] = os.environ['DMLC_PS_ROOT_URI'] + os.environ['GRAPHMIX_ROOT_PORT'] = str(default_graph_root_port) + if 'DMLC_INTERFACE' in os.environ.keys(): + os.environ['GRAPHMIX_INTERFACE'] = os.environ['DMLC_INTERFACE'] + + # get local job number + args.num_local_worker = int(settings["launch"]["worker"]) + args.num_local_graph_server = int(settings["launch"]["graph_server"]) + args.num_local_server = int(settings["launch"]["server"]) + args.scheduler = settings["launch"]["scheduler"] + assert args.num_local_graph_server <= shard.meta["num_part"] + assert args.num_local_worker <= int(os.environ['DMLC_NUM_WORKER']) + assert args.num_local_server <= int(os.environ['DMLC_NUM_SERVER']) + if hybrid_config == "worker": + args.num_local_server = 0 + args.num_local_graph_server = 0 + args.scheduler = False + args.num_local_worker = 1 + elif hybrid_config == "server": + args.num_local_worker = 0 + + # launch workers + for i in range(args.num_local_worker): + proc = multiprocessing.Process( + target=start_worker, args=[target, args]) + process_list.append(proc) + # launch graph servers + for i in range(args.num_local_graph_server): + proc = multiprocessing.Process( + target=start_graph_server, args=[shard, server_init]) + process_list.append(proc) + # launch ps servers + for i in range(args.num_local_server): + proc = multiprocessing.Process(target=start_server, args=[]) + process_list.append(proc) + # launch scheduler + if args.scheduler: + proc = multiprocessing.Process(target=start_scheduler) + process_list.append(proc) + # wait until all process finish + for proc in process_list: + proc.start() + signal.signal(signal.SIGINT, signal_handler) + for proc in process_list: + proc.join() diff --git a/examples/gnn/gnn_tools/log.py b/examples/gnn/gnn_tools/log.py new file mode 100644 index 0000000..1c5f536 --- /dev/null +++ b/examples/gnn/gnn_tools/log.py @@ -0,0 +1,68 @@ +import multiprocessing +import numpy as np +import time + +logfile = open("log.txt", "w") + + +class SharedTrainingStat(): + def __init__(self): + self.manager = multiprocessing.Manager() + self.lock = self.manager.Lock() + self.total = self.manager.Value("total", 0) + self.acc = self.manager.Value("acc", 0) + self.loss = self.manager.Value("loss", 0.0) + self.count = self.manager.Value("count", 0) + self.train_total = self.manager.Value("train_total", 0) + self.train_acc = self.manager.Value("train_acc", 0) + self.train_loss = self.manager.Value("train_loss", 0.0) + self.train_count = self.manager.Value("train_count", 0) + self.time = [] + + def update(self, acc, total, loss): + self.lock.acquire() + 
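+        # accumulate evaluation counters under the lock; print() reports the averages and then resets them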
self.total.value += total + self.acc.value += acc + self.loss.value += loss + self.count.value += 1 + self.lock.release() + + def update_train(self, acc, total, loss): + self.lock.acquire() + self.train_total.value += total + self.train_acc.value += acc + self.train_loss.value += loss + self.train_count.value += 1 + self.lock.release() + + def print(self, start=""): + self.lock.acquire() + if len(self.time) > 3: + epoch_time = np.array(self.time[1:])-np.array(self.time[:-1]) + print( + "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time))) + self.time.append(time.time()) + print( + start, + "test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format( + self.loss.value / self.count.value, + self.acc.value / self.total.value, + self.train_loss.value / self.train_count.value, + self.train_acc.value / self.train_total.value + ) + ) + print( + self.loss.value / self.count.value, self.acc.value / self.total.value, + self.train_loss.value / self.train_count.value, self.train_acc.value / + self.train_total.value, + file=logfile, flush=True + ) + self.total.value = 0 + self.acc.value = 0 + self.loss.value = 0 + self.count.value = 0 + self.train_total.value = 0 + self.train_acc.value = 0 + self.train_loss.value = 0 + self.train_count.value = 0 + self.lock.release() diff --git a/examples/gnn/gnn_tools/part_graph.py b/examples/gnn/gnn_tools/part_graph.py new file mode 100644 index 0000000..45896e5 --- /dev/null +++ b/examples/gnn/gnn_tools/part_graph.py @@ -0,0 +1,25 @@ +from graphmix.partition import part_graph +from graphmix.dataset import load_dataset +from sparse_datasets import load_sparse_dataset +import argparse +import os.path as osp +import yaml + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", "-d", required=True) + parser.add_argument("--nparts", "-n", required=True) + parser.add_argument("--path", "-p", required=True) + args = parser.parse_args() + output_path = str(args.path) + nparts = int(args.nparts) + dataset, idx_max = load_sparse_dataset(args.dataset) + output_path = osp.expanduser(osp.join(output_path, args.dataset)) + part_graph(dataset, nparts, output_path) + # now write idx_max into meta.yml + meta_file = osp.join(output_path, "meta.yml") + with open(meta_file) as f: + meta = yaml.load(f.read(), Loader=yaml.FullLoader) + meta["idx_max"] = idx_max + with open(meta_file, "w") as f: + yaml.dump(meta, f, sort_keys=False) diff --git a/examples/gnn/gnn_tools/prepare_amazon_data.py b/examples/gnn/gnn_tools/prepare_amazon_data.py new file mode 100644 index 0000000..71beb0d --- /dev/null +++ b/examples/gnn/gnn_tools/prepare_amazon_data.py @@ -0,0 +1,120 @@ +import json +import ast +import numpy as np +import nltk +# all products with metadata +filemap = {'train': './amazon-3M_train_map.txt', + 'test': './amazon-3M_test_map.txt', + 'bow': './amazon-3M_feature_map.txt', + 'meta': './metadata.json', + 'output': './graph.npz', + 'output_sparse': './sparsefeature.npy'} + + +def getBagofWord(): + bow = dict() + with open(filemap['bow'], 'r') as f: + # start with 1, 0 for padding + word_cnt = 1 + for line in f.read().strip().split(): + bow[line] = word_cnt + word_cnt += 1 + return bow + + +def gettoken(descriptions, length): + bow = getBagofWord() + token_matrix = [] + for desc in descriptions: + token_id = [] + token = nltk.word_tokenize(desc.lower()) + for word in token: + if word in bow: + token_id.append(bow[word]) + if len(token_id) == length: + break + while len(token_id) < length: + 
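+                # pad shorter descriptions with 0, the id reserved for padding (word ids start from 1)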
token_id.append(0) + token_matrix.append(token_id) + return np.array(token_matrix) + + +prod_all = dict() +prod_rcd = dict() +with open(filemap['meta'], 'r') as f: + for line in f: + prod = ast.literal_eval(line.strip().replace('\n', '\\n')) + asin = prod['asin'] + prod_all[asin] = prod + if 'related' in prod and 'categories' in prod and 'description' in prod: + prod_rcd[asin] = prod + +testNodes = set() +prod_gcn = dict() +asin2id = dict() +cnt_id = 0 +asinlist = [] + +for kword in ['train', 'test']: + with open(filemap[kword], 'r') as f: + for line in f: + asin = line.split()[0] + if asin in prod_rcd: + if kword == 'test': + testNodes.add(asin) + prod_gcn[asin] = prod_rcd[asin] + asin2id[asin] = cnt_id + cnt_id += 1 + asinlist.append(asin) + +graphlen = len(prod_gcn) +print('#products with rel/cat/des/feat (GCN assumptions)', graphlen) +print('#trainNodes:', graphlen-len(testNodes), 'testNodes:', len(testNodes)) + +print(len(asin2id)) + +cat2id = dict() +cnt_id = 0 + +class_map = np.zeros(graphlen).astype(np.int32) +train_map = np.zeros(graphlen).astype(np.int32) +descriptions = [] +for idx, asin in enumerate(asinlist): + prod = prod_gcn[asin] + isTest = True if asin in testNodes else False + + cat = prod['categories'][0][0] + if cat not in cat2id: + cat2id[cat] = (cnt_id, 0, 0) + cnt_id += 1 + + if isTest: + cat2id[cat] = (cat2id[cat][0], cat2id[cat][1], cat2id[cat][2]+1) + else: + cat2id[cat] = (cat2id[cat][0], cat2id[cat][1]+1, cat2id[cat][2]) + + class_map[idx] = cat2id[cat][0] + train_map[idx] = 0 if isTest else 1 + if "title" in prod: + descriptions.append(prod["title"] + " " + prod['description']) + else: + descriptions.append(prod['description']) + +print('Classes:', cat2id) +print("Num Classes:", len(cat2id)) + +links_set = set() +for idx, asin in enumerate(asinlist): + for rel, neighbors in prod_gcn[asin]['related'].items(): + for asin_nei in neighbors: + if asin_nei not in asin2id: + continue + idx_nei = asin2id[asin_nei] + lk = (idx, idx_nei) if idx_nei > idx else (idx_nei, idx) + if lk not in links_set: + links_set.add(lk) +links = np.array(list(links_set)) +print('#links between products:', len(links)) +token_matrix = gettoken(descriptions, 16) +np.savez(file=filemap['output'], y=class_map, train_map=train_map, edge=links) +np.save(file=filemap['output_sparse'], arr=token_matrix) diff --git a/examples/gnn/gnn_tools/sparse_datasets.py b/examples/gnn/gnn_tools/sparse_datasets.py new file mode 100644 index 0000000..54582f5 --- /dev/null +++ b/examples/gnn/gnn_tools/sparse_datasets.py @@ -0,0 +1,89 @@ +import graphmix +from graphmix.dataset import load_dataset +import numpy as np +import os.path as osp + + +class AmazonSparseDataset(): + def __init__(self, dataset_root): + self.name = "AmazonSparse" + data = np.load(osp.join(dataset_root, "graph.npz")) + feat = np.load(osp.join(dataset_root, "sparsefeature.npy")) + num_nodes = feat.shape[0] + edge = data['edge'].T + directed = np.concatenate([edge, edge[[1, 0]]], axis=1) + self.idx_max = np.max(feat) + 1 + node_id = np.arange(num_nodes).reshape(-1, 1) + self.idx_max + self.idx_max += num_nodes + self.x = np.empty([num_nodes, 0]) + self.y = np.concatenate( + [feat, node_id, data['y'].reshape(-1, 1)], axis=-1) + self.train_mask = data["train_map"] + self.graph = graphmix.Graph( + edge_index=directed, + num_nodes=num_nodes + ) + self.num_classes = int(np.max(data['y']) + 1) + + +class OGBNmagDataset(): + def __init__(self, dataset_root): + self.name = "ogbn-mag" + from ogb.nodeproppred import PygNodePropPredDataset + dataset = 
PygNodePropPredDataset(name=self.name, root=dataset_root) + data = dataset[0] + year = data.node_year['paper'].numpy() + self.train_mask = year < 2018 + edge = data.edge_index_dict['paper', 'cites', 'paper'].numpy() + directed = np.concatenate([edge, edge[[1, 0]]], axis=1) + num_nodes = data.num_nodes_dict['paper'] + self.graph = graphmix.Graph( + edge_index=directed, + num_nodes=num_nodes + ) + self.num_classes = dataset.num_classes + + def process_sparse_idx(rel, length, base): + sp_idx = [[] for i in range(num_nodes)] + for i, j in rel.T: + sp_idx[i].append(j) + for i in range(num_nodes): + if len(sp_idx[i]) > length: + sp_idx[i] = sp_idx[i][0:length] + while len(sp_idx[i]) < length: + sp_idx[i].append(-1) + sp_idx = np.array(sp_idx) + sp_idx += (base + 1) + return sp_idx + + node_id = np.arange(num_nodes).reshape(-1, 1) + field = data.edge_index_dict[( + 'paper', 'has_topic', 'field_of_study')].numpy() + paper_field = process_sparse_idx(field, 10, num_nodes) + idx_max = num_nodes + data.num_nodes_dict['field_of_study'] + 1 + author = data.edge_index_dict[('author', 'writes', 'paper')].numpy() + paper_author = process_sparse_idx(author[[1, 0]], 10, idx_max) + idx_max += data.num_nodes_dict['author'] + 1 + self.idx_max = idx_max + self.x = np.empty([num_nodes, 0]) + self.y = np.concatenate([ + paper_field, paper_author, node_id, data.y_dict["paper"].numpy() + ], axis=1) + + +def load_sparse_dataset(name): + root_dir = osp.expanduser(osp.join('~/.graphmix_dataset/', name)) + if name == "Reddit": + dataset = load_dataset(name) + idx_max = dataset.x.shape[0] + node_id = np.arange(idx_max).reshape(-1, 1) + dataset.y = np.concatenate([node_id, dataset.y.reshape(-1, 1)], axis=1) + elif name == "AmazonSparse": + dataset = AmazonSparseDataset(root_dir) + idx_max = dataset.idx_max + elif name == "ogbn-mag": + dataset = OGBNmagDataset(root_dir) + idx_max = dataset.idx_max + else: + raise NotImplementedError + return dataset, int(idx_max) diff --git a/examples/gnn/run_dist.py b/examples/gnn/run_dist.py new file mode 100644 index 0000000..8b1bccc --- /dev/null +++ b/examples/gnn/run_dist.py @@ -0,0 +1,91 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model +from gnn_tools.log import SharedTrainingStat +import graphmix + +import hetu as ht + +import numpy as np +import argparse + +# usage : on each machine +# python3 run_dist.py [configfile] [-p data_path] + + +def train_main(args): + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(rank % args.num_local_worker) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS', + use_sparse_pull=False, cstable_policy=args.cache) + nbatches = meta["train_node"] // (args.batch_size * nrank) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] 
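+            # pick the next sampled graph; it is handed to the data loader below while this iteration still trains on the current `graph`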
+ idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + stat.update(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + stat.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + ht.get_worker_communicate().BarrierWorker() + graph = graph_nxt + if rank == 0: + stat.print(epoch) + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("config") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + parser.add_argument("--cache", default="LFUOpt", type=str) + args = parser.parse_args() + stat = SharedTrainingStat() + launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init) diff --git a/examples/gnn/run_dist_hybrid.py b/examples/gnn/run_dist_hybrid.py new file mode 100644 index 0000000..21cf333 --- /dev/null +++ b/examples/gnn/run_dist_hybrid.py @@ -0,0 +1,153 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model +import graphmix + +import hetu as ht +from hetu.communicator.mpi_nccl_comm import ncclDataType_t, ncclRedOp_t + +import numpy as np +import time +import os +import sys +import multiprocessing +import argparse + +# usage : +# mpirun -np 4 --allow-run-as-root python3 run_dist_hybrid.py [configfile] [-p data_path] +# python3 run_dist_hybrid.py [configfile] [-p data_path] --server + + +class TrainStat(): + def __init__(self, comm): + self.file = open("log.txt", "w") + self.train_stat = np.zeros(4) + self.test_stat = np.zeros(4) + self.count = 0 + self.time = [] + self.comm = comm + + def update_test(self, cnt, total, loss): + self.test_stat += [1, cnt, total, loss] + + def update_train(self, cnt, total, loss): + self.train_stat += [1, cnt, total, loss] + + def sync_and_clear(self): + self.count += 1 + train_stat = ht.array(self.train_stat, ht.cpu()) + test_stat = ht.array(self.test_stat, ht.cpu()) + self.comm.dlarrayNcclAllReduce( + train_stat, train_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream) + self.comm.dlarrayNcclAllReduce( + test_stat, test_stat, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum, self.comm.stream) + self.comm.stream.sync() + train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy() + printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format( + self.count, + 
test_stat[3] / test_stat[0], + test_stat[1] / test_stat[2], + train_stat[3] / train_stat[0], + train_stat[1] / train_stat[2], + ) + logstr = "{} {} {} {}".format( + test_stat[3] / test_stat[0], + test_stat[1] / test_stat[2], + train_stat[3] / train_stat[0], + train_stat[1] / train_stat[2], + ) + self.time.append(time.time()) + if self.comm.device_id.value == 0: + print(printstr, flush=True) + print(logstr, file=self.file, flush=True) + if len(self.time) > 3: + epoch_time = np.array(self.time[1:])-np.array(self.time[:-1]) + print( + "epoch time: {:.3f}+-{:.3f}".format(np.mean(epoch_time), np.var(epoch_time))) + + self.train_stat[:] = 0 + self.test_stat[:] = 0 + + +def train_main(args): + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(device_id) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = 10 * meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx, comm_mode='Hybrid', + use_sparse_pull=False, cstable_policy=args.cache) + nbatches = meta["train_node"] // (args.batch_size * nrank) + train_state = TrainStat(comm) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(device_id)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + train_state.update_test(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + train_state.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + ht.get_worker_communicate().BarrierWorker() + graph = graph_nxt + train_state.sync_and_clear() + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("config") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + parser.add_argument("--cache", default="LFUOpt", type=str) + parser.add_argument("--server", action="store_true") + args = 
parser.parse_args() + if args.server: + launch_graphmix_and_hetu_ps( + train_main, args, server_init, hybrid_config="server") + else: + launch_graphmix_and_hetu_ps( + train_main, args, server_init, hybrid_config="worker") diff --git a/examples/gnn/run_single.py b/examples/gnn/run_single.py new file mode 100644 index 0000000..b5a8418 --- /dev/null +++ b/examples/gnn/run_single.py @@ -0,0 +1,95 @@ +from gnn_tools.launcher import launch_graphmix_and_hetu_ps +from gnn_tools.log import SharedTrainingStat +from gnn_model.utils import get_norm_adj, prepare_data +from gnn_model.model import sparse_model, dense_model +import graphmix + +import hetu as ht + +import numpy as np +import time +import os +import sys +import argparse + +# usage +# python3 run_single.py [-p data_path] + + +def train_main(args): + cli = graphmix.Client() + meta = cli.meta + hidden_layer_size = args.hidden_size + num_epoch = args.num_epoch + rank = cli.rank() + nrank = cli.num_worker() + ctx = ht.gpu(rank % args.num_local_worker) + embedding_width = args.hidden_size + # the last two is train label and other train mask + num_int_feature = meta["int_feature"] - 2 + # sample some graphs + ngraph = meta["train_node"] // (args.batch_size * nrank) + graphs = prepare_data(ngraph) + # build model + if args.dense: + [loss, y, train_op], [mask_, norm_adj_] = dense_model( + meta["float_feature"], args.hidden_size, meta["class"], args.learning_rate) + else: + [loss, y, train_op], [mask_, norm_adj_] = sparse_model( + num_int_feature, args.hidden_size, meta["idx_max"], args.hidden_size, meta["class"], args.learning_rate) + + idx = 0 + graph = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph) + ht.GNNDataLoaderOp.step(graph) + executor = ht.Executor([loss, y, train_op], ctx=ctx) + nbatches = meta["train_node"] // (args.batch_size * nrank) + for epoch in range(num_epoch): + for _ in range(nbatches): + graph_nxt = graphs[idx] + idx = (idx + 1) % ngraph + ht.GNNDataLoaderOp.step(graph_nxt) + train_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] == 1) + eval_mask = np.bitwise_and( + graph.extra[:, 0], graph.i_feat[:, -1] != 1) + feed_dict = { + norm_adj_: get_norm_adj(graph, ht.gpu(rank % args.num_local_worker)), + mask_: train_mask + } + loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict) + y_predicted = y_predicted.asnumpy().argmax(axis=1) + + acc = np.sum((y_predicted == graph.i_feat[:, -2]) * eval_mask) + train_acc = np.sum( + (y_predicted == graph.i_feat[:, -2]) * train_mask) + stat.update(acc, eval_mask.sum(), np.sum( + loss_val.asnumpy()*eval_mask)/eval_mask.sum()) + stat.update_train(train_acc, train_mask.sum(), np.sum( + loss_val.asnumpy()*train_mask)/train_mask.sum()) + graph = graph_nxt + stat.print(epoch) + + +def server_init(server): + batch_size = args.batch_size + server.init_cache(0.1, graphmix.cache.LFUOpt) + worker_per_server = server.num_worker() // server.num_server() + server.add_sampler(graphmix.sampler.GraphSage, batch_size=batch_size, + depth=2, width=2, thread=4 * worker_per_server, subgraph=True) + server.is_ready() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--config", default="config/single.yml") + parser.add_argument("--path", "-p", required=True) + parser.add_argument("--num_epoch", default=300, type=int) + parser.add_argument("--hidden_size", default=128, type=int) + parser.add_argument("--learning_rate", default=1, type=float) + parser.add_argument("--batch_size", default=128, type=int) + 
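+    # --dense selects the dense-feature model in train_main; omit it for datasets partitioned with sparse embeddings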
parser.add_argument("--dense", action="store_true") + args = parser.parse_args() + stat = SharedTrainingStat() + launch_graphmix_and_hetu_ps(train_main, args, server_init=server_init) diff --git a/examples/nlp/.gitignore b/examples/nlp/.gitignore new file mode 100644 index 0000000..c90eef6 --- /dev/null +++ b/examples/nlp/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +iwslt2016/ +logs/ \ No newline at end of file diff --git a/examples/nlp/README.md b/examples/nlp/README.md new file mode 100644 index 0000000..53351d8 --- /dev/null +++ b/examples/nlp/README.md @@ -0,0 +1,18 @@ +# NLP Examples +In this directory we provide simple implementations for Transformer model. We use the IWSLT2016 de-en dataset. +## Structure +``` +- nlp + - hparams.py Hyperparameters + - prepare_data.py Downloading and preparing data + - data_load.py Dataloader + - hetu_transformer.py Transformer model in hetu + - tf_transformer.py Transformer model in tensorflow + - train_hetu_transformer.py Trainer for hetu + - train_tf_transformer.py Trainer for tensorflow +``` +## Usage +```bash +python train_{framework}_transformer.py +``` +To change the hyperparameters, please modify `hparams.py` file. \ No newline at end of file diff --git a/examples/nlp/__init__.py b/examples/nlp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/nlp/bookcorpus/bookcorpus.py b/examples/nlp/bookcorpus/bookcorpus.py new file mode 100644 index 0000000..42c696e --- /dev/null +++ b/examples/nlp/bookcorpus/bookcorpus.py @@ -0,0 +1,92 @@ +# Lint as: python3 +"""The BookCorpus dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """ + + +import glob +import os +import pathlib + +import datasets + + +_DESCRIPTION = """\ +Books are a rich source of both fine-grained information, how a character, \ +an object or a scene looks like, as well as high-level semantics, what \ +someone is thinking, feeling and how these states evolve through a story.\ +This version of bookcorpus has 17868 dataset items (books). Each item contains \ +two fields: title and text. The title is the name of the book (just the file name) \ +while text contains unprocessed book text. The bookcorpus has been prepared by \ +Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community \ +driven platform dedicated to the archiving and long-term preservation of any and \ +all data including but by no means limited to... websites, books, games, software, \ +video, audio, other digital-obscura and ideas. +""" + +_CITATION = """\ +@InProceedings{Zhu_2015_ICCV, + title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books}, + author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {December}, + year = {2015} +} +""" +_PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27" +# _DOWNLOAD_URL = "https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz" +_DOWNLOAD_URL = "/home/xiaonan/develope/Athena/datasets/books_doc_format.tar.gz" + + +class BookCorpusOpenConfig(datasets.BuilderConfig): + """BuilderConfig for BookCorpus.""" + + def __init__(self, **kwargs): + """BuilderConfig for BookCorpus. + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(BookCorpusOpenConfig, self).__init__( + version=datasets.Version("1.0.0", ""), **kwargs) + + +class BookCorpusOpen(datasets.GeneratorBasedBuilder): + """BookCorpus dataset.""" + + BUILDER_CONFIGS = [ + BookCorpusOpenConfig( + name="plain_text", + description="Plain text", + ) + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "title": datasets.Value("string"), + "text": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage=_PROJECT_URL, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={ + "directory": arch_path}), + ] + + def _generate_examples(self, directory): + glob_target = os.path.join(directory, "**/*.epub.txt") + book_files = glob.glob(glob_target, recursive=True) + book_files = sorted(book_files) + _id = 0 + for book_file_path in book_files: + path = pathlib.PurePath(book_file_path) + with open(book_file_path, mode="r", encoding="utf-8") as f: + yield _id, {"title": str(path.name), "text": f.read()}, + _id += 1 diff --git a/examples/nlp/data_load.py b/examples/nlp/data_load.py new file mode 100644 index 0000000..b5966f2 --- /dev/null +++ b/examples/nlp/data_load.py @@ -0,0 +1,120 @@ +import numpy as np + + +class DataLoader(object): + def __init__(self, fpath1, fpath2, maxlen1, maxlen2, vocab_fpath): + self.sents1, self.sents2 = self.load_data( + fpath1, fpath2, maxlen1, maxlen2) + self.token2idx, self.idx2token = self.load_vocab(vocab_fpath) + self.maxlen1 = maxlen1 + self.maxlen2 = maxlen2 + + def load_vocab(self, vocab_fpath): + '''Loads vocabulary file and returns idx<->token maps + vocab_fpath: string. vocabulary file path. + Note that these are reserved + 0: , 1: , 2: , 3: + + Returns + two dictionaries. + ''' + vocab = [line.split()[0] for line in open( + vocab_fpath, 'r', encoding='utf-8').read().splitlines()] + token2idx = {token: idx for idx, token in enumerate(vocab)} + idx2token = {idx: token for idx, token in enumerate(vocab)} + return token2idx, idx2token + + def load_data(self, fpath1, fpath2, maxlen1, maxlen2): + '''Loads source and target data and filters out too lengthy samples. + fpath1: source file path. string. + fpath2: target file path. string. + maxlen1: source sent maximum length. scalar. + maxlen2: target sent maximum length. scalar. + + Returns + sents1: list of source sents + sents2: list of target sents + ''' + sents1, sents2 = [], [] + with open(fpath1, 'r', encoding='utf-8') as f1, open(fpath2, 'r', encoding='utf-8') as f2: + for sent1, sent2 in zip(f1, f2): + if len(sent1.split()) + 1 > maxlen1: + continue # 1: + if len(sent2.split()) + 1 > maxlen2: + continue # 1: + sents1.append(sent1.strip()) + sents2.append(sent2.strip()) + return sents1, sents2 + + def encode(self, inp, type, dict): + '''Converts string to number. Used for `generator_fn`. + inp: 1d byte array. 
+ type: "x" (source side) or "y" (target side) + dict: token2idx dictionary + + Returns + list of numbers + ''' + inp_str = inp + if type == "x": + tokens = inp_str.split() + [""] + else: + tokens = [""] + inp_str.split() + [""] + + x = [dict.get(t, dict[""]) for t in tokens] + return x + + def make_epoch_data(self, batch_size, shuffle=False): + import copy + new_sents1 = copy.deepcopy(self.sents1) + new_sents2 = copy.deepcopy(self.sents2) + if shuffle: + import random + random.shuffle(new_sents1) + random.shuffle(new_sents2) + xs = [self.encode(sent1, "x", self.token2idx) for sent1 in new_sents1] + ys = [self.encode(sent2, "y", self.token2idx) for sent2 in new_sents2] + batch_xs = [] + batch_ys = [] + for i in range(0, len(xs), batch_size): + start = i + end = start + batch_size + batch_xs.append(xs[start:end]) + batch_ys.append(ys[start:end]) + if len(batch_xs[-1]) != batch_size: + batch_xs = batch_xs[:-1] + batch_ys = batch_ys[:-1] + self.cur_xs = batch_xs + self.cur_ys = batch_ys + self.batch_num = len(batch_xs) + self.idx = 0 + + def get_batch(self, fill_maxlen=True): + if self.idx >= self.batch_num: + assert False + cur_batch_x = self.cur_xs[self.idx] + cur_batch_y = self.cur_ys[self.idx] + self.idx += 1 + + if fill_maxlen: + cur_largest_len_x = self.maxlen1 + cur_largest_len_y = self.maxlen2 + else: + cur_largest_len_x = max([len(x) for x in cur_batch_x]) + cur_largest_len_y = max([len(y) for y in cur_batch_y]) + + cur_batch_x = np.array([self.align(x, cur_largest_len_x) + for x in cur_batch_x]).astype(np.float32) + cur_batch_y = np.array([self.align(y, cur_largest_len_y) + for y in cur_batch_y]).astype(np.float32) + return (cur_batch_x, cur_largest_len_x), (cur_batch_y, cur_largest_len_y) + + def align(self, arr, length): + ori_len = len(arr) + if length > ori_len: + return arr + [0] * (length - ori_len) + else: + return arr[:length] + + def get_pad(self): + return self.token2idx[""] diff --git a/examples/nlp/hetu_transformer.py b/examples/nlp/hetu_transformer.py new file mode 100644 index 0000000..907f7a8 --- /dev/null +++ b/examples/nlp/hetu_transformer.py @@ -0,0 +1,266 @@ +import hetu as ht +from hetu import init +import numpy as np + + +def layer_norm( + input_tensor, + feature_size, + eps=1e-8 +): + scale = init.ones(name='layer_norm_scale', shape=(feature_size, )) + bias = init.zeros(name='layer_norm_biad', shape=(feature_size, )) + return ht.layer_normalization_op(input_tensor, scale, bias, eps=eps) + + +def dense( + input_tensor, + fan_in, + fan_out, + activation=None, + kernel_initializer=init.xavier_normal, + bias_initializer=init.zeros +): + weights = kernel_initializer(name='dense_weights', shape=(fan_in, fan_out)) + bias = bias_initializer(name='dense_bias', shape=(fan_out,)) + outputs = ht.matmul_op(input_tensor, weights) + outputs = outputs + ht.broadcastto_op(bias, outputs) + if activation is not None: + outputs = activation(outputs) + return outputs + + +def dropout( + input_tensor, + dropout_prob +): + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + output = ht.dropout_op(input_tensor, 1.0 - dropout_prob) + return output + + +def get_token_embeddings(vocab_size, num_units, initializer=init.xavier_normal, zero_pad=True): + if zero_pad: + embedding_part = initializer( + name='embedding_table', shape=(vocab_size-1, num_units)) + padding_zero = init.zeros( + name='padding_zero', shape=(1, num_units), trainable=False) + embeddings = ht.concat_op(padding_zero, embedding_part) + else: + embeddings = initializer( + name='embedding_table', 
shape=(vocab_size, num_units)) + return embeddings + + +def multihead_attention( + queries, keys, values, + config, + query_act=None, key_act=None, value_act=None, + attention_mask=None, + causality=False): + + def transpose_for_scores(input_tensor): + output_tensor = ht.array_reshape_op( + input_tensor, [config.batch_size, -1, config.num_heads, config.d_model // config.num_heads]) + + output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3]) + return output_tensor + + batch_size = config.batch_size + hidden_size = config.d_model + num_attention_heads = config.num_heads + caus_len = config.maxlen2 - 1 + attention_probs_dropout_prob = config.dropout_rate + + size_per_head = hidden_size // num_attention_heads + + # reshape to 2d + queries2d = ht.array_reshape_op( + queries, [-1, hidden_size]) # (N * T_q, d_model) + keys2d = ht.array_reshape_op(keys, [-1, hidden_size]) # (N * T_k, d_model) + values2d = ht.array_reshape_op( + values, [-1, hidden_size]) # (N * T_k, d_model) + + # linear transformation + query_layer = dense(queries2d, hidden_size, hidden_size, + query_act) # (N * T_k, d_model) + key_layer = dense(keys2d, hidden_size, hidden_size, + key_act) # (N * T_k, d_model) + value_layer = dense(values2d, hidden_size, hidden_size, + value_act) # (N * T_k, d_model) + + # transpose + query_layer = transpose_for_scores(query_layer) # (N, h, T_q, d_model/h) + key_layer = transpose_for_scores(key_layer) # (N, h, T_k, d_model/h) + value_layer = transpose_for_scores(value_layer) # (N, h, T_k, d_model/h) + + # score + attention_scores = ht.batch_matmul_op( + query_layer, key_layer, trans_B=True) # (N, h, T_q, T_k) + attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head))) + + # mask + if attention_mask is not None: + zeros = ht.Variable('no_mask', value=np.array( + (0,), dtype=np.float32), trainable=False) + adder = ht.Variable('attention_mask', value=np.array( + (-2**32+1,), dtype=np.float32), trainable=False) + zeros = ht.broadcastto_op(zeros, attention_mask) + adder = ht.broadcastto_op(adder, attention_mask) + attention_mask = ht.where_op(attention_mask, zeros, adder) # (N, T) + attention_mask = ht.array_reshape_op( + attention_mask, [batch_size, 1, 1, -1]) + attention_scores = attention_scores + \ + ht.broadcastto_op(attention_mask, attention_scores) + if causality: + tril = ht.Variable(name='tril', value=np.tril( + np.ones((caus_len, caus_len))), trainable=False) # (T, T) + future_masks = ht.broadcast_shape_op( + tril, [batch_size, num_attention_heads, caus_len, caus_len]) + adder = ht.Variable('future_mask', value=np.array( + (-2**32+1,), dtype=np.float32), trainable=False) + adder = ht.broadcastto_op(adder, future_masks) + attention_scores = ht.where_op( + future_masks, attention_scores, adder) # (N, h, T, T) + + # probs + attention_probs = ht.softmax_op(attention_scores) + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + context_layer = ht.batch_matmul_op(attention_probs, value_layer) + context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) + outputs = ht.array_reshape_op( + context_layer, + [batch_size, -1, num_attention_heads * size_per_head]) + + # Residual connection + outputs = outputs + queries # (N, T_q, d_model) + + # Normalize + outputs = layer_norm(outputs, hidden_size) # (N, T_q, d_model) + return outputs + + +def ff(inputs, config): + outputs = ht.array_reshape_op(inputs, [-1, config.d_model]) + outputs = dense(outputs, config.d_model, + config.d_ff, activation=ht.relu_op) + outputs = dense(outputs, config.d_ff, 
config.d_model) + outputs = ht.array_reshape_op( + outputs, [config.batch_size, -1, config.d_model]) + outputs = outputs + inputs + outputs = layer_norm(outputs, config.d_model) + return outputs + + +def label_smoothing(inputs, V, epsilon=0.1): + # V = inputs.shape[-1] # number of channels + return ((1-epsilon) * inputs) + (epsilon / V) + + +def positional_encoding( + inputs, + inputs_shape, + maxlen, + masking=True +): + N, T, E = tuple(inputs_shape) + position_enc = np.array([ + [pos / np.power(10000, (i & -2)/E) for i in range(E)] + for pos in range(maxlen)]) + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + + position_enc = position_enc[:T, :] + outputs = ht.Variable(name='position_enc', value=np.tile( + position_enc, [N, 1, 1]), trainable=False) + zeros = ht.Variable(name='zeros', value=np.zeros( + inputs_shape), trainable=False) + + if masking: + outputs = ht.where_op(inputs, outputs, zeros) + + return outputs + + +class Transformer(object): + def __init__(self, hp): + self.hp = hp + self.embeddings = get_token_embeddings( + self.hp.vocab_size, self.hp.d_model, zero_pad=True) + + def encode(self, xs): + x = xs + + # embedding + enc = ht.embedding_lookup_op(self.embeddings, x) # (N, T1, d_model) + enc = enc * self.hp.d_model**0.5 # scale + + enc += positional_encoding(enc, (self.hp.batch_size, + self.hp.maxlen1, self.hp.d_model), self.hp.maxlen1) + enc = dropout(enc, self.hp.dropout_rate) + + # Blocks + for i in range(self.hp.num_blocks): + # self-attention + enc = multihead_attention( + queries=enc, keys=enc, values=enc, + config=self.hp, + attention_mask=x, + causality=False + ) + # feed forward + enc = ff(enc, config=self.hp) + memory = enc + return memory + + def decode(self, ys, memory, src_masks): + decoder_inputs = ys + + # embedding + dec = ht.embedding_lookup_op( + self.embeddings, decoder_inputs) # (N, T2, d_model) + dec = dec * self.hp.d_model ** 0.5 # scale + + dec += positional_encoding(dec, (self.hp.batch_size, + self.hp.maxlen2-1, self.hp.d_model), self.hp.maxlen2) + dec = dropout(dec, self.hp.dropout_rate) + + # Blocks + for i in range(self.hp.num_blocks): + # Masked self-attention (Note that causality is True at this time) + dec = multihead_attention( + queries=dec, keys=dec, values=dec, + config=self.hp, + attention_mask=decoder_inputs, + causality=True, + ) + # Vanilla attention + dec = multihead_attention( + queries=dec, keys=memory, values=memory, + config=self.hp, + attention_mask=src_masks, + causality=False, + ) + # Feed Forward + dec = ff(dec, config=self.hp) + + dec = ht.array_reshape_op( + dec, [-1, self.hp.d_model]) # (N * T, d_model) + logits = ht.array_reshape_op(ht.matmul_op(dec, self.embeddings, trans_B=True), [ + self.hp.batch_size, -1, self.hp.vocab_size]) # (N, T, vocab) + + return logits + + def train(self, xs, ys): + # forward + memory = self.encode(xs) + logits = self.decode(ys[0], memory, xs) + + # train scheme + y = ys[1] + y_ = label_smoothing(ht.one_hot_op( + y, self.hp.vocab_size), self.hp.vocab_size) # (N, T, vocab) + loss = ht.softmaxcrossentropy_op(logits, y_) + + return loss diff --git a/examples/nlp/hparams.py b/examples/nlp/hparams.py new file mode 100644 index 0000000..fbce591 --- /dev/null +++ b/examples/nlp/hparams.py @@ -0,0 +1,63 @@ +import argparse + + +class Hparams: + parser = argparse.ArgumentParser() + + # prepro + parser.add_argument('--vocab_size', default=32000, type=int) + + # train + # files + parser.add_argument('--train1', 
default='iwslt2016/segmented/train.de.bpe', + help="german training segmented data") + parser.add_argument('--train2', default='iwslt2016/segmented/train.en.bpe', + help="english training segmented data") + parser.add_argument('--eval1', default='iwslt2016/segmented/eval.de.bpe', + help="german evaluation segmented data") + parser.add_argument('--eval2', default='iwslt2016/segmented/eval.en.bpe', + help="english evaluation segmented data") + parser.add_argument('--eval3', default='iwslt2016/prepro/eval.en', + help="english evaluation unsegmented data") + + # vocabulary + parser.add_argument('--vocab', default='iwslt2016/segmented/bpe.vocab', + help="vocabulary file path") + + # training scheme + parser.add_argument('--batch_size', default=16, type=int) + parser.add_argument('--eval_batch_size', default=128, type=int) + + parser.add_argument('--lr', default=0.0003, + type=float, help="learning rate") + parser.add_argument('--warmup_steps', default=4000, type=int) + parser.add_argument('--logdir', default="logs/tf", help="log directory") + parser.add_argument('--num_epochs', default=20, type=int) + parser.add_argument('--evaldir', default="logs/tf/eval", + help="evaluation dir") + + # model + parser.add_argument('--d_model', default=512, type=int, + help="hidden dimension of encoder/decoder") + parser.add_argument('--d_ff', default=2048, type=int, + help="hidden dimension of feedforward layer") + parser.add_argument('--num_blocks', default=6, type=int, + help="number of encoder/decoder blocks") + parser.add_argument('--num_heads', default=8, type=int, + help="number of attention heads") + parser.add_argument('--maxlen1', default=100, type=int, + help="maximum length of a source sequence") + parser.add_argument('--maxlen2', default=100, type=int, + help="maximum length of a target sequence") + parser.add_argument('--dropout_rate', default=0.3, type=float) + parser.add_argument('--smoothing', default=0.1, type=float, + help="label smoothing rate") + + # test + parser.add_argument('--test1', default='iwslt2016/segmented/test.de.bpe', + help="german test segmented data") + parser.add_argument('--test2', default='iwslt2016/prepro/test.en', + help="english test data") + parser.add_argument('--ckpt', help="checkpoint file path") + parser.add_argument('--test_batch_size', default=128, type=int) + parser.add_argument('--testdir', default="test/1", help="test result dir") diff --git a/examples/nlp/prepare_data.py b/examples/nlp/prepare_data.py new file mode 100644 index 0000000..85b9ff6 --- /dev/null +++ b/examples/nlp/prepare_data.py @@ -0,0 +1,128 @@ +import os +import wget +import tarfile +import errno +import sentencepiece as spm +import re +from hparams import Hparams +import logging + +logging.basicConfig(level=logging.INFO) + + +def prepro(hp): + """Load raw data -> Preprocessing -> Segmenting with sentencepice + hp: hyperparams. argparse. 
+ """ + logging.info("# Check if raw files exist") + train1 = "iwslt2016/de-en/train.tags.de-en.de" + train2 = "iwslt2016/de-en/train.tags.de-en.en" + eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml" + eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml" + test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml" + test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml" + for f in (train1, train2, eval1, eval2, test1, test2): + if not os.path.isfile(f): + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f) + + logging.info("# Preprocessing") + # train + + def _prepro(x): return [line.strip() for line in open(x, 'r').read().split("\n") + if not line.startswith("<")] + prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2) + assert len(prepro_train1) == len( + prepro_train2), "Check if train source and target files match." + + # eval + def _prepro(x): return [re.sub("<[^>]+>", "", line).strip() + for line in open(x, 'r').read().split("\n") + if line.startswith("= num_to_predict: + break + masked_token = None + # replace with [MASK] at 80%. + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # keep original at 10%. + if rng.random() < 0.5: + masked_token = tokens[index] + # replace with random word at 10%. + else: + masked_token = vocab_words[rng.randint( + 0, len(vocab_words) - 1)] + output_tokens[index] = masked_token + masked_lms.append([index, tokens[index]]) + + masked_lms.sort(key=lambda x: x[0]) + masked_lm_positions = [] + masked_lm_labels = [] + + for p in masked_lms: + masked_lm_positions.append(p[0]) + masked_lm_labels.append(p[1]) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """ Create Training example for input document """ + document = all_document[doc_id] + max_num_tokens = max_seq_length - 3 # [CLS], [SEP], [SEP] + target_seq_length = max_num_tokens + # generate short sequence at the probility of short_seq_prob + # In order to minimize the mismatch between pre-training and fine-tuning. + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # create sentence A + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + tokens_a = [] + for j in range(a_end): + tokens_a.extend([current_chunk[j]]) + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + for _ in range(10): + random_document_index = rng.randint( + 0, len(all_document) - 1) + if random_document_index != doc_id: + break + # If picked random document is the same as the current document + if random_document_index == doc_id: + is_random_next = False + random_document = all_document[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend([random_document[j]]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
+ num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend([current_chunk[j]]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +def convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq): + + num_instances = len(instances) + input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") + input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32") + segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") + masked_lm_positions_list = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + masked_lm_ids_list = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + next_sentence_labels_list = np.zeros(num_instances, dtype="int32") + for (idx, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids( + instance.masked_lm_labels) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + input_ids_list[idx][:] = input_ids + input_mask_list[idx][:] = input_mask + segment_ids_list[idx][:] = segment_ids + masked_lm_positions_list[idx][:] = masked_lm_ids + next_sentence_labels_list[idx] = next_sentence_label + + return input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list + + +def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): + + documents = [] + for i in range(dataset['train'].shape[0]): + tokens = tokenizer.tokenize(dataset['train'][i]['text']) + documents.append(tokens) + print(len(tokens)) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + + for doc_id in range(len(documents)): + instances.extend(create_data_from_document(documents, doc_id, + max_seq_length, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng)) + + # instance: + # tokens + # segment_ids + # is_random_next + # masked_lm_positions + # masked_lm_labels + return 
convert_instance_to_data(instances, tokenizer, max_seq_length, max_predictions_per_seq) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def show_dataset_detail(dataset): + print(dataset.shape) + print(dataset.column_names) + print(dataset['train'].features) + print(dataset['train'][0]['text']) + + +if __name__ == "__main__": + max_seq_length = 512 + do_lower_case = True + short_seq_prob = 0.1 + masked_lm_prob = 0.15 + max_predictions_per_seq = 20 + + vocab_path = "/home/xiaonan/develope/Athena/datasets/bert-base-uncased-vocab.txt" + dataset = load_dataset( + '/home/xiaonan/develope/Athena/examples/nlp/bookcorpus', cache_dir=".") + print("total number of documents {} ".format(dataset['train'].shape[0])) + random_seed = 123 + rng = random.Random(random_seed) + tokenizer = hetu.BertTokenizer( + vocab_file=vocab_path, do_lower_case=do_lower_case) + + input_ids_list, input_mask_list, segment_ids_list, masked_lm_positions_list, next_sentence_labels_list = create_pretrain_data( + dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng) + print(input_ids_list[-1]) + print(input_mask_list[-1]) + print(segment_ids_list[-1]) + print(masked_lm_positions_list[-1]) + print(next_sentence_labels_list[-1]) diff --git a/examples/nlp/tf_transformer.py b/examples/nlp/tf_transformer.py new file mode 100644 index 0000000..2c6b1d7 --- /dev/null +++ b/examples/nlp/tf_transformer.py @@ -0,0 +1,442 @@ +import numpy as np +import tensorflow as tf + +from tqdm import tqdm +import logging + +logging.basicConfig(level=logging.INFO) + + +def ln(inputs, epsilon=1e-8, scope="ln"): + '''Applies layer normalization. See https://arxiv.org/abs/1607.06450. + inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`. + epsilon: A floating number. A very small number for preventing ZeroDivision Error. + scope: Optional scope for `variable_scope`. + + Returns: + A tensor with the same shape and data dtype as `inputs`. + ''' + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + inputs_shape = inputs.get_shape() + params_shape = inputs_shape[-1:] + + mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) + beta = tf.get_variable("beta", params_shape, + initializer=tf.zeros_initializer()) + gamma = tf.get_variable("gamma", params_shape, + initializer=tf.ones_initializer()) + normalized = (inputs - mean) / ((variance + epsilon) ** (.5)) + outputs = gamma * normalized + beta + + return outputs + + +def get_token_embeddings(vocab_size, num_units, initializer=tf.contrib.layers.xavier_initializer(), zero_pad=True): + '''Constructs token embedding matrix. + Note that the column of index 0's are set to zeros. + vocab_size: scalar. V. + num_units: embedding dimensionalty. E. + zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero + To apply query/key masks easily, zero pad is turned on. 
+ + Returns + weight variable: (V, E) + ''' + with tf.variable_scope("shared_weight_matrix"): + embeddings = tf.get_variable('weight_mat', + dtype=tf.float32, + shape=(vocab_size, num_units), + initializer=initializer) + if zero_pad: + embeddings = tf.concat((tf.zeros(shape=[1, num_units]), + embeddings[1:, :]), 0) + return embeddings + + +def multihead_attention( + queries, keys, values, + batch_size, hidden_size, + num_attention_heads=8, + query_act=None, key_act=None, value_act=None, + attention_mask=None, + attention_probs_dropout_prob=0.0, + training=True, causality=False, + scope="multihead_attention"): + + def transpose_for_scores(input_tensor): + output_tensor = tf.reshape( + input_tensor, [batch_size, -1, num_attention_heads, hidden_size // num_attention_heads]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + size_per_head = hidden_size // num_attention_heads + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # linear transformation + query_layer = tf.layers.dense( + queries, hidden_size, activation=query_act) # (N, T_q, d_model) + key_layer = tf.layers.dense( + keys, hidden_size, activation=key_act) # (N, T_k, d_model) + value_layer = tf.layers.dense( + values, hidden_size, activation=value_act) # (N, T_k, d_model) + + # transpose + query_layer = transpose_for_scores( + query_layer) # (N, h, T_q, d_model/h) + key_layer = transpose_for_scores(key_layer) # (N, h, T_k, d_model/h) + value_layer = transpose_for_scores( + value_layer) # (N, h, T_k, d_model/h) + + # score + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True) # (N, h, T_q, T_k) + attention_scores /= size_per_head ** 0.5 + + # mask + if attention_mask is not None: + attention_mask = tf.to_float(attention_mask) # (N, T_k) + attention_mask = tf.reshape( + attention_mask, [batch_size, 1, 1, -1]) # (N, 1, 1, T_k) + attention_scores = attention_scores + \ + attention_mask * (-2**32+1) # (N, h, T_q, T_k) + if causality: + diag_vals = tf.ones_like( + attention_scores[0, 0, :, :]) # (T_q, T_k) + tril = tf.linalg.LinearOperatorLowerTriangular( + diag_vals).to_dense() # (T_q, T_k) + future_masks = tf.broadcast_to( + tril, [batch_size, num_attention_heads, tril.shape[0], tril.shape[1]]) # (N, h, T_q, T_k) + paddings = tf.ones_like(future_masks) * (-2**32+1) + attention_scores = tf.where( + tf.equal(future_masks, 0), paddings, attention_scores) + + # probs + attention_probs = tf.nn.softmax(attention_scores) # (N, h, T_q, T_k) + attention_probs = tf.layers.dropout( + attention_probs, rate=attention_probs_dropout_prob, training=training) + # (N, h, T_q, d_model/h) + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose( + context_layer, [0, 2, 1, 3]) # (N, T_q, h, d_model/h) + outputs = tf.reshape(context_layer, [ + batch_size, -1, num_attention_heads * size_per_head]) # (N, T_q, d_model) + + # Residual connection + outputs += queries # (N, T_q, d_model) + + # Normalize + outputs = ln(outputs) # (N, T_q, d_model) + + return outputs + + +def ff(inputs, num_units, scope="positionwise_feedforward"): + '''position-wise feed forward net. See 3.3 + + inputs: A 3d tensor with shape of [N, T, C]. + num_units: A list of two integers. + scope: Optional scope for `variable_scope`. 
+ + Returns: + A 3d tensor with the same shape and dtype as inputs + ''' + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # Inner layer + outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu) + # Outer layer + outputs = tf.layers.dense(outputs, num_units[1]) + # Residual connection + outputs += inputs + # Normalize + outputs = ln(outputs) + return outputs + + +def label_smoothing(inputs, epsilon=0.1): + '''Applies label smoothing. See 5.4 and https://arxiv.org/abs/1512.00567. + inputs: 3d tensor. [N, T, V], where V is the number of vocabulary. + epsilon: Smoothing rate. + + For example, + + ``` + import tensorflow as tf + inputs = tf.convert_to_tensor([[[0, 0, 1], + [0, 1, 0], + [1, 0, 0]], + + [[1, 0, 0], + [1, 0, 0], + [0, 1, 0]]], tf.float32) + + outputs = label_smoothing(inputs) + + with tf.Session() as sess: + print(sess.run([outputs])) + + >> + [array([[[ 0.03333334, 0.03333334, 0.93333334], + [ 0.03333334, 0.93333334, 0.03333334], + [ 0.93333334, 0.03333334, 0.03333334]], + + [[ 0.93333334, 0.03333334, 0.03333334], + [ 0.93333334, 0.03333334, 0.03333334], + [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] + ``` + ''' + V = inputs.get_shape().as_list()[-1] # number of channels + return ((1-epsilon) * inputs) + (epsilon / V) + + +def positional_encoding(inputs, + maxlen, + masking=True, + scope="positional_encoding"): + '''Sinusoidal Positional_Encoding. See 3.5 + inputs: 3d tensor. (N, T, E) + maxlen: scalar. Must be >= T + masking: Boolean. If True, padding positions are set to zeros. + scope: Optional scope for `variable_scope`. + + returns + 3d tensor that has the same shape as inputs. + ''' + + E = inputs.get_shape().as_list()[-1] # static + N, T = tf.shape(inputs)[0], tf.shape(inputs)[1] # dynamic + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + # position indices + position_ind = tf.tile(tf.expand_dims( + tf.range(T), 0), [N, 1]) # (N, T) + + # First part of the PE function: sin and cos argument + position_enc = np.array([ + [pos / np.power(10000, (i-i % 2)/E) for i in range(E)] + for pos in range(maxlen)]) + + # Second part, apply the cosine to even columns and sin to odds. + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + position_enc = tf.convert_to_tensor( + position_enc, tf.float32) # (maxlen, E) + + # lookup + outputs = tf.nn.embedding_lookup(position_enc, position_ind) + + # masks + if masking: + outputs = tf.where(tf.equal(inputs, 0), inputs, outputs) + + return tf.to_float(outputs) + +# def noam_scheme(init_lr, global_step, warmup_steps=4000.): +# '''Noam scheme learning rate decay +# init_lr: initial learning rate. scalar. +# global_step: scalar. +# warmup_steps: scalar. During warmup_steps, learning rate increases +# until it reaches init_lr. +# ''' +# step = tf.cast(global_step + 1, dtype=tf.float32) +# return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) + + +class Transformer(object): + ''' + xs: tuple of + x: int32 tensor. (N, T1) + x_seqlens: int32 tensor. (N,) + sents1: str tensor. (N,) + ys: tuple of + decoder_input: int32 tensor. (N, T2) + y: int32 tensor. (N, T2) + y_seqlen: int32 tensor. (N, ) + sents2: str tensor. (N,) + training: boolean. 
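    With the defaults in hparams.py (batch_size=16, maxlen1=maxlen2=100) and the
    placeholders built in train_tf_transformer.py, x has shape (16, 100) while
    decoder_input and y have shape (16, 99), because the target sentence is
    shifted by one position to form the decoder input (y[:, :-1]) and the
    label (y[:, 1:]).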
+ ''' + + def __init__(self, hp): + self.hp = hp + # self.token2idx, self.idx2token = load_vocab(hp.vocab) + self.embeddings = get_token_embeddings( + self.hp.vocab_size, self.hp.d_model, zero_pad=True) + + def encode(self, xs, training=True): + ''' + Returns + memory: encoder outputs. (N, T1, d_model) + ''' + with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): + x = xs + + # src_masks + src_masks = tf.math.equal(x, 0) # (N, T1) + + # embedding + enc = tf.nn.embedding_lookup( + self.embeddings, x) # (N, T1, d_model) + enc *= self.hp.d_model**0.5 # scale + + enc += positional_encoding(enc, self.hp.maxlen1) + enc = tf.layers.dropout( + enc, self.hp.dropout_rate, training=training) + + # Blocks + for i in range(self.hp.num_blocks): + with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE): + # self-attention + enc = multihead_attention( + queries=enc, keys=enc, values=enc, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=src_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=False + ) + # feed forward + enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model]) + memory = enc + return memory, src_masks + + def decode(self, ys, memory, src_masks, training=True): + ''' + memory: encoder outputs. (N, T1, d_model) + src_masks: (N, T1) + + Returns + logits: (N, T2, V). float32. + y_hat: (N, T2). int32 + y: (N, T2). int32 + sents2: (N,). string. + ''' + with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE): + decoder_inputs = ys + + # tgt_masks + tgt_masks = tf.math.equal(decoder_inputs, 0) # (N, T2) + + # embedding + dec = tf.nn.embedding_lookup( + self.embeddings, decoder_inputs) # (N, T2, d_model) + dec *= self.hp.d_model ** 0.5 # scale + + dec += positional_encoding(dec, self.hp.maxlen2) + dec = tf.layers.dropout( + dec, self.hp.dropout_rate, training=training) + + # Blocks + for i in range(self.hp.num_blocks): + with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE): + # Masked self-attention (Note that causality is True at this time) + dec = multihead_attention( + queries=dec, keys=dec, values=dec, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=tgt_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=True, + scope="self_attention" + ) + # Vanilla attention + dec = multihead_attention( + queries=dec, keys=memory, values=memory, + batch_size=self.hp.batch_size, hidden_size=self.hp.d_model, + num_attention_heads=self.hp.num_heads, + attention_mask=src_masks, + attention_probs_dropout_prob=self.hp.dropout_rate, + training=training, + causality=False, + scope="vanilla_attention" + ) + # Feed Forward + dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model]) + + # Final linear projection (embedding weights are shared) + weights = tf.transpose(self.embeddings) # (d_model, vocab_size) + logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size) + # y_hat = tf.to_int32(tf.argmax(logits, axis=-1)) + + return logits + + def train(self, xs, ys): + ''' + Returns + loss: scalar. + train_op: training operation + global_step: scalar. 
+ summaries: training summary node + ''' + # forward + memory, src_masks = self.encode(xs) + logits = self.decode(ys[0], memory, src_masks) + + # train scheme + y = ys[1] + y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size)) + loss = tf.nn.softmax_cross_entropy_with_logits_v2( + logits=logits, labels=y_) + + return loss + + # def eval(self, xs, ys): + # '''Predicts autoregressively + # At inference, input ys is ignored. + # Returns + # y_hat: (N, T2) + # ''' + # decoder_inputs, y, y_seqlen, sents2 = ys + + # decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx[""] + # ys = (decoder_inputs, y, y_seqlen, sents2) + + # memory, sents1, src_masks = self.encode(xs, False) + + # logging.info("Inference graph is being built. Please be patient.") + # for _ in tqdm(range(self.hp.maxlen2)): + # logits, y_hat, y, sents2 = self.decode(ys, memory, src_masks, False) + # if tf.reduce_sum(y_hat, 1) == self.token2idx[""]: break + + # _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1) + # ys = (_decoder_inputs, y, y_seqlen, sents2) + + # # monitor a random sample + # n = tf.random_uniform((), 0, tf.shape(y_hat)[0]-1, tf.int32) + # sent1 = sents1[n] + # pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token) + # sent2 = sents2[n] + + # tf.summary.text("sent1", sent1) + # tf.summary.text("pred", pred) + # tf.summary.text("sent2", sent2) + # summaries = tf.summary.merge_all() + + # return y_hat, summaries + + +# def convert_idx_to_token_tensor(inputs, idx2token): +# '''Converts int32 tensor to string tensor. +# inputs: 1d int32 tensor. indices. +# idx2token: dictionary + +# Returns +# 1d string tensor. +# ''' +# def my_func(inputs): +# return " ".join(idx2token[elem] for elem in inputs) + +# return tf.py_func(my_func, [inputs], tf.string) + +# def load_vocab(vocab_fpath): +# '''Loads vocabulary file and returns idx<->token maps +# vocab_fpath: string. vocabulary file path. +# Note that these are reserved +# 0: , 1: , 2: , 3: + +# Returns +# two dictionaries. 
+# ''' +# vocab = [line.split()[0] for line in open(vocab_fpath, 'r', encoding='utf-8').read().splitlines()] +# token2idx = {token: idx for idx, token in enumerate(vocab)} +# idx2token = {idx: token for idx, token in enumerate(vocab)} +# return token2idx, idx2token diff --git a/examples/nlp/train_hetu_transformer.py b/examples/nlp/train_hetu_transformer.py new file mode 100644 index 0000000..174f2b1 --- /dev/null +++ b/examples/nlp/train_hetu_transformer.py @@ -0,0 +1,62 @@ +from tqdm import tqdm +import os +import math +import logging +from hparams import Hparams +from hetu_transformer import Transformer +from data_load import DataLoader +import hetu as ht +import numpy as np +# import time + +logging.basicConfig(level=logging.INFO) + + +logging.info("# hparams") +hparams = Hparams() +parser = hparams.parser +hp = parser.parse_args() +print(hp) + +logging.info("# Prepare train/eval batches") +dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab) + +ctx = ht.gpu(1) +xs = ht.Variable(name='xs') +ys1 = ht.Variable(name='ys1') +ys2 = ht.Variable(name='ys2') +nonpadding = ht.Variable(name='nonpadding') + +logging.info("# Load model") +m = Transformer(hp) +loss = m.train(xs, (ys1, ys2)) +loss = ht.div_op(ht.reduce_sum_op(loss * nonpadding, + axes=[0, 1]), ht.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7) +opt = ht.optim.SGDOptimizer(hp.lr) +train_op = opt.minimize(loss) +executor = ht.Executor([loss, train_op], ctx=ctx) + +logging.info("# Session") + + +for ep in range(hp.num_epochs): + dataloader.make_epoch_data(hp.batch_size) + for i in tqdm(range(dataloader.batch_num)): + xs_val, ys_val = dataloader.get_batch() + # st = time.time() + xs_val = xs_val[0] + ys1_val = ys_val[0][:, :-1] + ys2_val = ys_val[0][:, 1:] + nonpadding_val = np.not_equal( + ys2_val, dataloader.get_pad()).astype(np.float32) + _loss, _ = executor.run( + feed_dict={xs: xs_val, ys1: ys1_val, ys2: ys2_val, nonpadding: nonpadding_val}) + # en = time.time() + # if i == 100: + # exit() + + log_str = 'Iteration %d, loss %f' % (i, _loss.asnumpy()) + print(log_str) + # print('time: ', (en - st)) + +logging.info("Done") diff --git a/examples/nlp/train_tf_transformer.py b/examples/nlp/train_tf_transformer.py new file mode 100644 index 0000000..8def5d0 --- /dev/null +++ b/examples/nlp/train_tf_transformer.py @@ -0,0 +1,98 @@ +import tensorflow as tf + +from tqdm import tqdm +import os +import math +import logging +from hparams import Hparams +from tf_transformer import Transformer +from data_load import DataLoader +# import time + +logging.basicConfig(level=logging.INFO) + + +logging.info("# hparams") +hparams = Hparams() +parser = hparams.parser +hp = parser.parse_args() +print(hp) +# save_hparams(hp, hp.logdir) + +logging.info("# Prepare train/eval batches") +dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab) + +xs = tf.placeholder(name='xs', dtype=tf.int32, shape=[16, 100]) +ys1 = tf.placeholder(name='ys1', dtype=tf.int32, shape=[16, 99]) +ys2 = tf.placeholder(name='ys2', dtype=tf.int32, shape=[16, 99]) + +logging.info("# Load model") +m = Transformer(hp) +loss = m.train(xs, (ys1, ys2)) +nonpadding = tf.to_float(tf.not_equal(ys2, dataloader.get_pad())) # 0: +loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7) + +global_step = tf.train.get_or_create_global_step() +optimizer = tf.train.GradientDescentOptimizer(hp.lr) +train_op = optimizer.minimize(loss, global_step=global_step) +# y_hat, eval_summaries = m.eval(xs, ys) +# y_hat = m.infer(xs, ys) + 
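# Illustrative helper (a minimal sketch, not called by the training loop below):
# it mirrors the padding-masked averaging above in plain numpy, assuming the pad
# token has index 0 as in the reserved vocabulary described in data_load.py.
def _masked_mean_loss_example(per_token_loss, targets, pad_id=0):
    import numpy as np
    mask = (targets != pad_id).astype(np.float32)
    # Only non-pad target positions contribute, so short sentences are not
    # dominated by padding; e.g. losses [[0.5, 0.7, 0.2], [0.4, 0.0, 0.0]] with
    # targets [[5, 9, 3], [7, 0, 0]] give (0.5 + 0.7 + 0.2 + 0.4) / 4 = 0.45.
    return (per_token_loss * mask).sum() / (mask.sum() + 1e-7)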
+logging.info("# Session") +saver = tf.train.Saver(max_to_keep=hp.num_epochs) +with tf.Session() as sess: + ckpt = tf.train.latest_checkpoint(hp.logdir) + if ckpt is None: + logging.info("Initializing from scratch") + sess.run(tf.global_variables_initializer()) + # save_variable_specs(os.path.join(hp.logdir, "specs")) + else: + saver.restore(sess, ckpt) + + _gs = sess.run(global_step) + + for ep in range(hp.num_epochs): + dataloader.make_epoch_data(hp.batch_size) + for i in tqdm(range(dataloader.batch_num)): + xs_val, ys_val = dataloader.get_batch() + # st = time.time() + _loss, _, _gs = sess.run([loss, train_op, global_step], feed_dict={ + xs: xs_val[0], ys1: ys_val[0][:, :-1], ys2: ys_val[0][:, 1:]}) + # en = time.time() + # if i == 100: + # exit() + # epoch = math.ceil(_gs / num_train_batches) + + log_str = 'Iteration %d, loss %f' % (i, _loss) + print(log_str) + # print('time: ', (en - st)) + + # logging.info("epoch {} is done".format(ep)) + # _loss = sess.run(loss) # train loss + + # logging.info("# test evaluation") + # _, _eval_summaries = sess.run([eval_init_op, eval_summaries]) + # summary_writer.add_summary(_eval_summaries, _gs) + + # logging.info("# get hypotheses") + # hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, m.idx2token) + + # logging.info("# write results") + # model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss) + # if not os.path.exists(hp.evaldir): os.makedirs(hp.evaldir) + # translation = os.path.join(hp.evaldir, model_output) + # with open(translation, 'w') as fout: + # fout.write("\n".join(hypotheses)) + + # logging.info("# calc bleu score and append it to translation") + # calc_bleu(hp.eval3, translation) + + # logging.info("# save models") + # ckpt_name = os.path.join(hp.logdir, model_output) + # saver.save(sess, ckpt_name, global_step=_gs) + # logging.info("after training of {} epochs, {} has been saved.".format(epoch, ckpt_name)) + + # logging.info("# fall back to train mode") + + +logging.info("Done") diff --git a/examples/nlp/wikipedia.py b/examples/nlp/wikipedia.py new file mode 100644 index 0000000..d86c9ac --- /dev/null +++ b/examples/nlp/wikipedia.py @@ -0,0 +1,540 @@ +"""Wikipedia dataset containing cleaned articles of all languages.""" +import bz2 +import codecs +import json +import re +import xml.etree.cElementTree as etree + +import datasets + + +logger = datasets.logging.get_logger(__name__) + + +_CITATION = """\ +@ONLINE {wikidump, + author = {Wikimedia Foundation}, + title = {Wikimedia Downloads}, + url = {https://dumps.wikimedia.org} +} +""" + +_DESCRIPTION = """\ +Wikipedia dataset containing cleaned articles of all languages. +The datasets are built from the Wikipedia dump +(https://dumps.wikimedia.org/) with one split per language. Each example +contains the content of one full Wikipedia article with cleaning to strip +markdown and unwanted sections (references, etc.). +""" + +_LICENSE = ( + "This work is licensed under the Creative Commons Attribution-ShareAlike " + "3.0 Unported License. To view a copy of this license, visit " + "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to " + "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA." +) + +# Source: https://en.wikipedia.org/wiki/List_of_Wikipedias (accessed 3/1/2019) +# Removed because no articles: hz. 
+WIKIPEDIA_LANGUAGES = [ + "aa", + "ab", + "ace", + "ady", + "af", + "ak", + "als", + "am", + "an", + "ang", + "ar", + "arc", + "arz", + "as", + "ast", + "atj", + "av", + "ay", + "az", + "azb", + "ba", + "bar", + "bat-smg", + "bcl", + "be", + "be-x-old", + "bg", + "bh", + "bi", + "bjn", + "bm", + "bn", + "bo", + "bpy", + "br", + "bs", + "bug", + "bxr", + "ca", + "cbk-zam", + "cdo", + "ce", + "ceb", + "ch", + "cho", + "chr", + "chy", + "ckb", + "co", + "cr", + "crh", + "cs", + "csb", + "cu", + "cv", + "cy", + "da", + "de", + "din", + "diq", + "dsb", + "dty", + "dv", + "dz", + "ee", + "el", + "eml", + "en", + "eo", + "es", + "et", + "eu", + "ext", + "fa", + "ff", + "fi", + "fiu-vro", + "fj", + "fo", + "fr", + "frp", + "frr", + "fur", + "fy", + "ga", + "gag", + "gan", + "gd", + "gl", + "glk", + "gn", + "gom", + "gor", + "got", + "gu", + "gv", + "ha", + "hak", + "haw", + "he", + "hi", + "hif", + "ho", + "hr", + "hsb", + "ht", + "hu", + "hy", + "ia", + "id", + "ie", + "ig", + "ii", + "ik", + "ilo", + "inh", + "io", + "is", + "it", + "iu", + "ja", + "jam", + "jbo", + "jv", + "ka", + "kaa", + "kab", + "kbd", + "kbp", + "kg", + "ki", + "kj", + "kk", + "kl", + "km", + "kn", + "ko", + "koi", + "krc", + "ks", + "ksh", + "ku", + "kv", + "kw", + "ky", + "la", + "lad", + "lb", + "lbe", + "lez", + "lfn", + "lg", + "li", + "lij", + "lmo", + "ln", + "lo", + "lrc", + "lt", + "ltg", + "lv", + "mai", + "map-bms", + "mdf", + "mg", + "mh", + "mhr", + "mi", + "min", + "mk", + "ml", + "mn", + "mr", + "mrj", + "ms", + "mt", + "mus", + "mwl", + "my", + "myv", + "mzn", + "na", + "nah", + "nap", + "nds", + "nds-nl", + "ne", + "new", + "ng", + "nl", + "nn", + "no", + "nov", + "nrm", + "nso", + "nv", + "ny", + "oc", + "olo", + "om", + "or", + "os", + "pa", + "pag", + "pam", + "pap", + "pcd", + "pdc", + "pfl", + "pi", + "pih", + "pl", + "pms", + "pnb", + "pnt", + "ps", + "pt", + "qu", + "rm", + "rmy", + "rn", + "ro", + "roa-rup", + "roa-tara", + "ru", + "rue", + "rw", + "sa", + "sah", + "sat", + "sc", + "scn", + "sco", + "sd", + "se", + "sg", + "sh", + "si", + "simple", + "sk", + "sl", + "sm", + "sn", + "so", + "sq", + "sr", + "srn", + "ss", + "st", + "stq", + "su", + "sv", + "sw", + "szl", + "ta", + "tcy", + "te", + "tet", + "tg", + "th", + "ti", + "tk", + "tl", + "tn", + "to", + "tpi", + "tr", + "ts", + "tt", + "tum", + "tw", + "ty", + "tyv", + "udm", + "ug", + "uk", + "ur", + "uz", + "ve", + "vec", + "vep", + "vi", + "vls", + "vo", + "wa", + "war", + "wo", + "wuu", + "xal", + "xh", + "xmf", + "yi", + "yo", + "za", + "zea", + "zh", + "zh-classical", + "zh-min-nan", + "zh-yue", + "zu", +] + +_BASE_URL_TMPL = "https://dumps.wikimedia.org/{lang}wiki/{date}/" +_INFO_FILE = "dumpstatus.json" + + +class WikipediaConfig(datasets.BuilderConfig): + """BuilderConfig for Wikipedia.""" + + def __init__(self, language=None, date=None, **kwargs): + """BuilderConfig for Wikipedia. + Args: + language: string, the language code for the Wikipedia dump to use. + date: string, date of the Wikipedia dump in YYYYMMDD format. A list of + available dates can be found at https://dumps.wikimedia.org/enwiki/. + **kwargs: keyword arguments forwarded to super. 
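        For example, WikipediaConfig(language="en", date="20200501") selects the
        English dump of 2020-05-01 and is registered under the builder config
        name "20200501.en".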
+ """ + super(WikipediaConfig, self).__init__( + name="{0}.{1}".format(date, language), + description="Wikipedia dataset for {0}, parsed from {1} dump.".format( + language, date), + **kwargs, + ) + self.date = date + self.language = language + + +_VERSION = datasets.Version("1.0.0", "") + + +class Wikipedia(datasets.BeamBasedBuilder): + """Wikipedia dataset.""" + + # Use mirror (your.org) to avoid download caps. + BUILDER_CONFIG_CLASS = WikipediaConfig + BUILDER_CONFIGS = [ + WikipediaConfig( + version=_VERSION, + language=lang, + date="20200501", + ) # pylint:disable=g-complex-comprehension + for lang in WIKIPEDIA_LANGUAGES + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features({"title": datasets.Value( + "string"), "text": datasets.Value("string")}), + # No default supervised_keys. + supervised_keys=None, + homepage="https://dumps.wikimedia.org", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager, pipeline): + def _base_url(lang): + return _BASE_URL_TMPL.format(lang=lang.replace("-", "_"), date=self.config.date) + + lang = self.config.language + + info_url = _base_url(lang) + _INFO_FILE + # Use dictionary since testing mock always returns the same result. + downloaded_files = dl_manager.download_and_extract({"info": info_url}) + + xml_urls = [] + total_bytes = 0 + with open(downloaded_files["info"], encoding="utf-8") as f: + dump_info = json.load(f) + multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"] + assert ( + multistream_dump_info["status"] == "done" + ), "Specified dump (%s) multistream status is not 'done': %s" % ( + _base_url(lang), + multistream_dump_info["status"], + ) + + for fname, info in multistream_dump_info["files"].items(): + if ".xml" not in fname: + continue + total_bytes += info["size"] + xml_urls.append(_base_url(lang) + fname) + + # Use dictionary since testing mock always returns the same result. + downloaded_files = dl_manager.download({"xml": xml_urls}) + if not pipeline.is_local(): + downloaded_files = dl_manager.ship_files_with_pipeline( + downloaded_files, pipeline) + + return [ + datasets.SplitGenerator( # pylint:disable=g-complex-comprehension + name=datasets.Split.TRAIN, gen_kwargs={ + "filepaths": downloaded_files["xml"], "language": lang} + ) + ] + + def _build_pcollection(self, pipeline, filepaths, language): + """Build PCollection of examples in the raw (text) form.""" + import apache_beam as beam + import mwparserfromhell + + def _extract_content(filepath): + """Extracts article content from a single WikiMedia XML file.""" + logger.info("generating examples from = %s", filepath) + with beam.io.filesystems.FileSystems.open(filepath) as f: + f = bz2.BZ2File(filename=f) + # Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563 + utf_f = codecs.getreader("utf-8")(f) + context = etree.iterparse(utf_f, events=("end",)) + for unused_event, elem in context: + if not elem.tag.endswith("page"): + continue + namespace = elem.tag[:-4] + title = elem.find("./{0}title".format(namespace)).text + ns = elem.find("./{0}ns".format(namespace)).text + id_ = elem.find("./{0}id".format(namespace)).text + + # Filter pages that are not in the "main" namespace. + if ns != "0": + elem.clear() + continue + + raw_content = elem.find( + "./{0}revision/{0}text".format(namespace)).text + elem.clear() + + # Filter redirects. 
+ if raw_content is None or raw_content.lower().startswith("#redirect"): + beam.metrics.Metrics.counter( + language, "filtered-redirects").inc() + continue + + beam.metrics.Metrics.counter( + language, "extracted-examples").inc() + yield (id_, title, raw_content) + + def _clean_content(inputs): + """Cleans raw wikicode to extract text.""" + id_, title, raw_content = inputs + try: + text = _parse_and_clean_wikicode( + raw_content, parser=mwparserfromhell) + except (mwparserfromhell.parser.ParserError) as e: + beam.metrics.Metrics.counter(language, "parser-error").inc() + logger.error("mwparserfromhell ParseError: %s", e) + return + + if not text: + beam.metrics.Metrics.counter( + language, "empty-clean-examples").inc() + return + + beam.metrics.Metrics.counter(language, "cleaned-examples").inc() + + yield id_, {"title": title, "text": text} + + return ( + pipeline + | "Initialize" >> beam.Create(filepaths) + | "Extract content" >> beam.FlatMap(_extract_content) + | "Distribute" >> beam.transforms.Reshuffle() + | "Clean content" >> beam.FlatMap(_clean_content) + ) + + +def _parse_and_clean_wikicode(raw_content, parser): + """Strips formatting and unwanted sections from raw page content.""" + wikicode = parser.parse(raw_content) + + # Filters for references, tables, and file/image links. + re_rm_wikilink = re.compile( + "^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE) + + def rm_wikilink(obj): + return bool(re_rm_wikilink.match(str(obj.title))) + + def rm_tag(obj): + return str(obj.tag) in {"ref", "table"} + + def rm_template(obj): + return obj.name.lower() in {"reflist", "notelist", "notelist-ua", "notelist-lr", "notelist-ur", "notelist-lg"} + + def try_remove_obj(obj, section): + try: + section.remove(obj) + except ValueError: + # For unknown reasons, objects are sometimes not found. + pass + + section_text = [] + # Filter individual sections to clean. + for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True): + for obj in section.ifilter_wikilinks(matches=rm_wikilink, recursive=True): + try_remove_obj(obj, section) + for obj in section.ifilter_templates(matches=rm_template, recursive=True): + try_remove_obj(obj, section) + for obj in section.ifilter_tags(matches=rm_tag, recursive=True): + try_remove_obj(obj, section) + + section_text.append(section.strip_code().strip()) + return "\n\n".join(section_text) diff --git a/examples/rec/.gitignore b/examples/rec/.gitignore new file mode 100644 index 0000000..ccb0cb2 --- /dev/null +++ b/examples/rec/.gitignore @@ -0,0 +1,2 @@ +datasets/ +logs/ diff --git a/examples/rec/README.md b/examples/rec/README.md new file mode 100644 index 0000000..5fc2afa --- /dev/null +++ b/examples/rec/README.md @@ -0,0 +1,42 @@ +# Recommendation Model Example (with Distributed Settings) +In this directory we provide NCF model for recommendation task on movielens dataset. + +## Structure +``` +- rec + - run_hetu.py basic trainer for hetu + - run_tf.py basic trainer for tensorflow + - run_tfworker.py trainer for tensorflow in PS setting + - run_parallax.py trainer for tensorflow in parallax setting + - hetu_ncf.py model implementatino in hetu + - tf_ncf.py model implementation in tensorflow + - movielens.py script to download and handle dataset +``` + +## Prepare movielens data +Simply `python movielens.py` . 
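This downloads MovieLens-25M and writes `train.npz` (arrays `user_input`, `item_input`, `labels`) and `test.npy` under `datasets/ml-25m/`. A minimal sketch of inspecting the generated files (assuming the script has finished and you run this from the same directory):

```python
import numpy as np

train = np.load('datasets/ml-25m/train.npz')
test = np.load('datasets/ml-25m/test.npy')

# Training triples: label 1 marks an observed user-item interaction,
# label 0 a randomly sampled negative item.
print(train['user_input'].shape, train['item_input'].shape, train['labels'].shape)

# One row of 100 candidate items per user: the held-out latest positive in
# column 0 followed by 99 sampled negatives, as used for HR/NDCG evaluation.
print(test.shape)
```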
+ +## Usage +```bash +# run locally +python run_hetu.py +# run in ps setting (locally) +bash ps_ncf.sh +# run in hybrid setting (locally) +bash hybrid_ncf.sh + +# run tensorflow locally +python run_tf.py +# run tensorflow in parallax +python {absolute_path_to}/run_parallax.py +# run tensorflow in ps setting +python ../ctr/tf_launch_server.py --config {config} --id {rank} +python run_tfworker.py --rank {rank} --config {config} +# or +python ../ctr/tf_launch_server.py --config ../ctr/settings/tf_local_s1_w8.json --id 0 +bash tf_8workers.sh +``` + + +## Configuration +Please refer to `ctr` directory. diff --git a/examples/rec/hetu_ncf.py b/examples/rec/hetu_ncf.py new file mode 100644 index 0000000..ecf8b5d --- /dev/null +++ b/examples/rec/hetu_ncf.py @@ -0,0 +1,47 @@ +import hetu as ht +from hetu import init + +import numpy as np + + +def neural_mf(user_input, item_input, y_, num_users, num_items): + embed_dim = 8 + layers = [64, 32, 16, 8] + learning_rate = 0.01 + + User_Embedding = init.random_normal( + (num_users, embed_dim + layers[0] // 2), stddev=0.01, name="user_embed", ctx=ht.cpu(0)) + Item_Embedding = init.random_normal( + (num_items, embed_dim + layers[0] // 2), stddev=0.01, name="item_embed", ctx=ht.cpu(0)) + + user_latent = ht.embedding_lookup_op( + User_Embedding, user_input, ctx=ht.cpu(0)) + item_latent = ht.embedding_lookup_op( + Item_Embedding, item_input, ctx=ht.cpu(0)) + + mf_user_latent = ht.slice_op(user_latent, (0, 0), (-1, embed_dim)) + mlp_user_latent = ht.slice_op(user_latent, (0, embed_dim), (-1, -1)) + mf_item_latent = ht.slice_op(item_latent, (0, 0), (-1, embed_dim)) + mlp_item_latent = ht.slice_op(item_latent, (0, embed_dim), (-1, -1)) + + W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1') + W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2') + W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3') + W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4') + + mf_vector = ht.mul_op(mf_user_latent, mf_item_latent) + mlp_vector = ht.concat_op(mlp_user_latent, mlp_item_latent, axis=1) + fc1 = ht.matmul_op(mlp_vector, W1) + relu1 = ht.relu_op(fc1) + fc2 = ht.matmul_op(relu1, W2) + relu2 = ht.relu_op(fc2) + fc3 = ht.matmul_op(relu2, W3) + relu3 = ht.relu_op(fc3) + concat_vector = ht.concat_op(mf_vector, relu3, axis=1) + y = ht.matmul_op(concat_vector, W4) + y = ht.sigmoid_op(y) + loss = ht.binarycrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=learning_rate) + train_op = opt.minimize(loss) + return loss, y, train_op diff --git a/examples/rec/hybrid_ncf.sh b/examples/rec/hybrid_ncf.sh new file mode 100644 index 0000000..39f7c9a --- /dev/null +++ b/examples/rec/hybrid_ncf.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_hetu.py + +python -m hetu.launcher ${workdir}/../ctr/settings/local_s1.yml -n 1 --sched & +mpirun --allow-run-as-root -np 4 python ${mainpy} --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_w4.yml diff --git a/examples/rec/movielens.py b/examples/rec/movielens.py new file mode 100644 index 0000000..90b5048 --- /dev/null +++ b/examples/rec/movielens.py @@ -0,0 +1,119 @@ +import os +import wget +import zipfile +from collections import defaultdict as dd +import numpy as np +import scipy.sparse as sp +from tqdm import tqdm + + +DATASETS = ["ml-1m", "ml-20m", "ml-25m"] +urls = { + "ml-1m": "https://files.grouplens.org/datasets/movielens/ml-1m.zip", + "ml-20m": 
"https://files.grouplens.org/datasets/movielens/ml-20m.zip", + "ml-25m": "https://files.grouplens.org/datasets/movielens/ml-25m.zip", +} + + +def download(dataset, data_dir, num_negatives=4): + if not os.path.exists(data_dir): + os.mkdir(data_dir) + assert dataset in ["ml-1m", "ml-20m", + "ml-25m"], 'Invalid dataset: %s.' % dataset + data_subdir = os.path.join(data_dir, dataset) + print('Data in', data_subdir) + zip_file = os.path.join(data_dir, dataset + '.zip') + ratings = os.path.join(data_subdir, 'ratings.csv') + if not os.path.exists(ratings): + if not os.path.exists(zip_file): + print('Downloading movielens %s...' % dataset) + wget.download(urls[dataset], zip_file) + with zipfile.ZipFile(zip_file, 'r') as zip_ref: + print('Extracting movielens %s...' % dataset) + zip_ref.extractall(data_dir) + ratings = os.path.join(data_subdir, 'ratings.csv') + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }[dataset] + + # Generate raw training and testing files + item_reverse_mapping = {} + cur_item_idx = 0 + latest = [(0, -1)] * num_users + mat = sp.dok_matrix((num_users, num_items), dtype=np.float32) + with open(ratings, 'r') as fr: + fr.readline() + for line in tqdm(fr): + entries = line.strip().split(',') + user = int(entries[0]) + item = int(entries[1]) + if item not in item_reverse_mapping: + item_reverse_mapping[item] = cur_item_idx + cur_item_idx += 1 + rating = float(entries[2]) + if rating <= 0: + continue + reitem = item_reverse_mapping[item] + mat[user-1, reitem] = 1 + timestamp = int(entries[-1]) + if latest[user-1][0] < timestamp: + latest[user-1] = (timestamp, reitem) + print('#users:', num_users, '#items:', num_items) + + new_lates = np.concatenate((np.array(latest, dtype=np.int32)[ + :, 1:], np.empty((num_users, 99), dtype=np.int32)), 1) + + # sample for test data first, each user 99 items, using all data + for i, lat in enumerate(latest): + new_lates[i][0] = lat[1] + for k in range(1, 100): + j = np.random.randint(num_items) + while (i, j) in mat.keys(): + j = np.random.randint(num_items) + new_lates[i][k] = j + np.save(os.path.join(data_subdir, 'test.npy'), new_lates) + + # sample for train data, each data with num_negative negative samples + all_num = (1 + num_negatives) * (len(mat.keys()) - num_users) + user_input = np.empty((all_num,), dtype=np.int32) + item_input = np.empty((all_num,), dtype=np.int32) + labels = np.empty((all_num,), dtype=np.int32) + idx = 0 + for (i, j) in mat.keys(): + if new_lates[i][0] == j: + continue + # positive instance + user_input[idx] = i + item_input[idx] = j + labels[idx] = 1 + idx += 1 + # negative instances + for t in range(num_negatives): + k = np.random.randint(num_items) + while (i, k) in mat.keys(): + k = np.random.randint(num_items) + user_input[idx] = i + item_input[idx] = k + labels[idx] = 0 + idx += 1 + assert all_num == idx + np.savez(os.path.join(data_subdir, 'train.npz'), + user_input=user_input, item_input=item_input, labels=labels) + + +def getdata(dataset, data_dir='datasets'): + assert dataset in ["ml-1m", "ml-20m", + "ml-25m"], 'Invalid dataset: %s.' 
% dataset + data_subdir = os.path.join(data_dir, dataset) + file_paths = [os.path.join(data_subdir, data) + for data in ['train.npz', 'test.npy']] + if any([not os.path.exists(path) for path in file_paths]): + download(dataset, data_dir) + return np.load(file_paths[0]), np.load(file_paths[1]) + + +if __name__ == "__main__": + download('ml-25m', 'datasets') diff --git a/examples/rec/ps_ncf.sh b/examples/rec/ps_ncf.sh new file mode 100644 index 0000000..415f752 --- /dev/null +++ b/examples/rec/ps_ncf.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_hetu.py + +python ${mainpy} --comm PS --cache lfuopt --bound 3 --config ${workdir}/../ctr/settings/local_s1_w4.yml diff --git a/examples/rec/run_hetu.py b/examples/rec/run_hetu.py new file mode 100644 index 0000000..e20cb85 --- /dev/null +++ b/examples/rec/run_hetu.py @@ -0,0 +1,187 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import numpy as np +import yaml +import time +import math +import argparse +from tqdm import tqdm +from hetu_ncf import neural_mf +import heapq # for retrieval topK + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/hetulog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def worker(args): + def validate(): + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + predictions = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + map_item_score = { + testItemInput[start_index + i]: predictions[0][i] for i in range(100)} + gtItem = testItemInput[start_index] + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, gtItem) + ndcg = getNDCG(ranklist, gtItem) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + def get_current_shard(data): + if args.comm is not None: + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + else: + return data + + device_id = 0 + if args.comm == 'PS': + rank = ht.get_worker_communicate().rank() + nrank = int(os.environ['DMLC_NUM_WORKER']) + device_id = rank % 8 + elif args.comm == 'Hybrid': + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + rank = comm.rank + nrank = int(os.environ['DMLC_NUM_WORKER']) + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input']) + trainItems = get_current_shard(trainData['item_input']) + trainLabels = get_current_shard(trainData['labels']) + testData = get_current_shard(testData) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input'][:1024000]) + trainItems = get_current_shard(trainData['item_input'][:1024000]) + trainLabels = get_current_shard(trainData['labels'][:1024000]) + testData = get_current_shard(testData[:1470]) + 
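        # test.npy holds one row of 100 candidate items per user (the held-out
        # positive first, then 99 sampled negatives), so each user id is
        # repeated 100 times and the item matrix is flattened to line up.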
testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + # assert not args.all or num_users == testData.shape[0] + batch_size = 1024 + num_negatives = 4 + topK = 10 + user_input = ht.dataloader_op([ + ht.Dataloader(trainUsers, batch_size, 'train'), + ht.Dataloader(testUserInput, 100, 'validate'), + ]) + item_input = ht.dataloader_op([ + ht.Dataloader(trainItems, batch_size, 'train'), + ht.Dataloader(testItemInput, 100, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'), + ]) + + loss, y, train_op = neural_mf( + user_input, item_input, y_, num_users, num_items) + + executor = ht.Executor({'train': [loss, train_op], 'validate': [y]}, ctx=ht.gpu(device_id), + comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123) + + path = 'logs/hetulog_%s' % ({None: 'local', + 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm]) + path += '_%d.txt' % rank if args.comm else '.txt' + log = Logging(path=path) + epoch = 7 + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in tqdm(range(executor.get_batch_num('train'))): + loss_val = executor.run('train', convert_to_numpy_ret_vals=True) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data, default to use 1024000 training data") + parser.add_argument("--comm", default=None, + help="whether to use distributed setting, can be None, AllReduce, PS, Hybrid") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument( + "--config", type=str, default="./settings/local_s1_w4.yml", help="configuration for ps") + args = parser.parse_args() + + if args.comm is None: + worker(args) + elif args.comm == 'Hybrid': + settings = yaml.load(open(args.config).read(), Loader=yaml.FullLoader) + value = settings['shared'] + os.environ['DMLC_ROLE'] = 'worker' + for k, v in value.items(): + os.environ[k] = str(v) + worker(args) + elif args.comm == 'PS': + launch(worker, args) + else: + raise NotImplementedError diff --git a/examples/rec/run_parallax.py b/examples/rec/run_parallax.py new file mode 100644 index 0000000..276d49a --- /dev/null +++ b/examples/rec/run_parallax.py @@ -0,0 +1,192 @@ +import os +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + +from autodist import AutoDist +from autodist.resource_spec import ResourceSpec +from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, 
Parallax +from autodist.strategy.base import Strategy +from autodist.kernel.common.utils import get_op_name +from tensorflow.python.framework import ops + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + +# Please DO NOT modify /etc/bash.bashrc to activate conda environment. +# Use python_venv in spec yml file instead. +# Use absolute path of python file. +# Here we use the tf native partitioner instead of autodist's PartitionPS. + + +class Parallaxx(PSLoadBalancing, AllReduce): + """ + Modify original parallax to remove replica on CPUs. + """ + + def __init__(self, chunk_size=128, local_proxy_variable=False, sync=True, staleness=0): + PSLoadBalancing.__init__(self, local_proxy_variable, sync, staleness) + AllReduce.__init__(self, chunk_size) + + # pylint: disable=attribute-defined-outside-init + def build(self, graph_item, resource_spec): + """Generate the strategy.""" + expr = Strategy() + + # For each variable, generate variable synchronizer config + expr.graph_config.replicas.extend( + [k for k, v in resource_spec.gpu_devices]) + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + self.loads = {ps: 0.0 for ps in reduction_device_names} + + # Generate node config + node_config = [] + for idx, var in enumerate(graph_item.trainable_var_op_to_var.values()): + var_op_name = get_op_name(var.name) + grad, _, _ = graph_item.var_op_name_to_grad_info[var_op_name] + if isinstance(grad, ops.Tensor): # this is a dense variable + group_id = idx // self.chunk_size + config = self._gen_all_reduce_node_config( + var.name, group=group_id) + else: # sparse updates + # For Parallax Strategy, all PS vars are sparse so we don't use a proxy. + # Sparse variables are likely larger, so keeping copies would be costlier, + # and usually each device only requires a small part of the overall variable. + config = self._gen_ps_node_config( + var, + # For Parallax Strategy, all PS vars are sparse which does not need proxy. 
+ False, + self._sync, + self._staleness + ) + node_config.append(config) + expr.node_config.extend(node_config) + + return expr + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def main(): + resource_spec_file = os.path.join(os.path.dirname( + __file__), '../ctr/settings', 'plx_local_spec.yml') + autodist = AutoDist(resource_spec_file, Parallaxx()) + respec = ResourceSpec(resource_spec_file) + + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(num_users): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + from movielens import getdata + trainData, testData = getdata('ml-25m', 'datasets') + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + num_negatives = 4 + topK = 10 + with tf.Graph().as_default() as g, autodist.scope(): + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + loss, y, opt = neural_mf( + user_input, item_input, y_, num_users, num_items) + train_op = opt.minimize(loss) + + sess = autodist.create_distributed_session() + + log = Logging(path=os.path.join( + os.path.dirname(__file__), 'logs', 'tfplx.txt')) + epoch = 7 + iterations = trainData['user_input'].shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainData['user_input'][start_index:start_index+batch_size], + item_input: trainData['item_input'][start_index:start_index+batch_size], + y_: trainData['labels'][start_index:start_index+batch_size], + } + + loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCG: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/run_tf.py b/examples/rec/run_tf.py new file mode 100644 index 0000000..f5c1e03 --- /dev/null +++ b/examples/rec/run_tf.py @@ -0,0 +1,145 @@
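+# Single-process TensorFlow NCF baseline on MovieLens; the Hetu counterpart is run_hetu.py in this directory.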
+import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = trainData['user_input'] + trainItems = trainData['item_input'] + trainLabels = trainData['labels'] + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = trainData['user_input'][:1024000] + trainItems = trainData['item_input'][:1024000] + trainLabels = trainData['labels'][:1024000] + testData = testData[:1470] + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + num_negatives = 4 + topK = 10 + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + loss, y, opt = neural_mf(user_input, item_input, y_, num_users, num_items) + train_op = opt.minimize(loss) + + init = tf.compat.v1.global_variables_initializer() + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + sess = tf.compat.v1.Session( + config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + sess.run(init) + + log = Logging() + epoch = 7 + iterations = trainUsers.shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in range(iterations): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainUsers[start_index:start_index+batch_size], + item_input: trainItems[start_index:start_index+batch_size], + y_: trainLabels[start_index:start_index+batch_size], + } +
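+ # one optimization step: sess.run on [loss, train_op] returns [batch_loss, None], so loss_val[0] is the scalar loss for this batch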
+ loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time:%f' % (time.time() - start)) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/run_tfworker.py b/examples/rec/run_tfworker.py new file mode 100644 index 0000000..20c0402 --- /dev/null +++ b/examples/rec/run_tfworker.py @@ -0,0 +1,192 @@ +import os +import json +import numpy as np +import tensorflow as tf +import time +import argparse +from tqdm import tqdm +from tf_ncf import neural_mf +import heapq # for retrieval topK +import math + + +def pop_env(): + for k in ['https_proxy', 'http_proxy']: + if k in os.environ: + os.environ.pop(k) + + +pop_env() + + +def getHitRatio(ranklist, gtItem): + for item in ranklist: + if item == gtItem: + return 1 + return 0 + + +def getNDCG(ranklist, gtItem): + for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) + return 0 + + +class Logging(object): + def __init__(self, path='logs/tflog.txt'): + with open(path, 'w') as fw: + fw.write('') + self.path = path + + def write(self, s): + print(s) + with open(self.path, 'a') as fw: + fw.write(s + '\n') + fw.flush() + + +def train_ncf(cluster, rank, nrank, args): + def validate(): + # validate phase + hits, ndcgs = [], [] + for idx in range(testData.shape[0]): + start_index = idx * 100 + my_feed_dict = { + user_input: testUserInput[start_index:start_index+100], + item_input: testItemInput[start_index:start_index+100], + } + predictions = sess.run([y], feed_dict=my_feed_dict) + map_item_score = { + testItemInput[start_index+i]: predictions[0][i] for i in range(100)} + + # Evaluate top rank list + ranklist = heapq.nlargest( + topK, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, testItemInput[start_index]) + ndcg = getNDCG(ranklist, testItemInput[start_index]) + hits.append(hr) + ndcgs.append(ndcg) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + return hr, ndcg + + def get_current_shard(data): + part_size = data.shape[0] // nrank + start = part_size * rank + end = start + part_size if rank != nrank - 1 else data.shape[0] + return data[start:end] + + from movielens import getdata + if args.all: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input']) + trainItems = get_current_shard(trainData['item_input']) + trainLabels = get_current_shard(trainData['labels']) + testData = get_current_shard(testData) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + else: + trainData, testData = getdata('ml-25m', 'datasets') + trainUsers = get_current_shard(trainData['user_input'][:1024000]) + trainItems = get_current_shard(trainData['item_input'][:1024000]) + trainLabels = get_current_shard(trainData['labels'][:1024000]) + testData = get_current_shard(testData[:1470]) + testUserInput = np.repeat( + np.arange(testData.shape[0], dtype=np.int32), 100) + testItemInput = testData.reshape((-1,)) + + num_users, num_items = { + 'ml-1m': (6040, 3706), + 'ml-20m': (138493, 26744), + 'ml-25m': (162541, 59047), + }['ml-25m'] + batch_size = 1024 + 
num_negatives = 4 + topK = 10 + + worker_device = "/job:worker/task:%d/gpu:0" % (rank) + with tf.device(worker_device): + user_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + item_input = tf.compat.v1.placeholder(tf.int32, [None, ]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, ]) + + with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster)): + server_num = len(cluster.as_dict()['ps']) + embed_partitioner = tf.fixed_size_partitioner( + server_num, 0) if server_num > 1 else None + loss, y, opt = neural_mf( + user_input, item_input, y_, num_users, num_items, embed_partitioner) + train_op = opt.minimize(loss) + + server = tf.train.Server( + cluster, job_name="worker", task_index=rank) + init = tf.compat.v1.global_variables_initializer() + sv = tf.train.Supervisor( + is_chief=(rank == 0), + init_op=init, + recovery_wait_secs=1) + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % rank]) + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + + log = Logging(path='logs/tflog%d.txt' % rank) + epoch = 7 + iterations = trainUsers.shape[0] // batch_size + start = time.time() + for ep in range(epoch): + ep_st = time.time() + log.write('epoch %d' % ep) + train_loss = [] + for idx in tqdm(range(iterations)): + start_index = idx * batch_size + my_feed_dict = { + user_input: trainUsers[start_index:start_index+batch_size], + item_input: trainItems[start_index:start_index+batch_size], + y_: trainLabels[start_index:start_index+batch_size], + } + + loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict) + train_loss.append(loss_val[0]) + + tra_loss = np.mean(train_loss) + ep_en = time.time() + + # validate phase + if args.val: + hr, ndcg = validate() + printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % ( + tra_loss, hr, ndcg, ep_en - ep_st) + else: + printstr = "train_loss: %.4f, train_time: %.4f" % ( + tra_loss, ep_en - ep_st) + log.write(printstr) + log.write('all time: %f' % (time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--val", action="store_true", + help="whether to perform validation") + parser.add_argument("--rank", type=int, required=True, + help="rank of process") + parser.add_argument( + "--config", type=str, default='../ctr/settings/tf_local_s1_w2.json', help="config file path") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + args = parser.parse_args() + task_id = int(args.rank) + raw_config = args.config + + config = json.load(open(raw_config)) + cluster = tf.train.ClusterSpec(config) + + train_ncf(cluster, task_id, len(config['worker']), args) + + +if __name__ == '__main__': + main() diff --git a/examples/rec/tf_8workers.sh b/examples/rec/tf_8workers.sh new file mode 100644 index 0000000..cd8c494 --- /dev/null +++ b/examples/rec/tf_8workers.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +workdir=$(cd $(dirname $0); pwd) +mainpy=${workdir}/run_tfworker.py + +CUDA_VISIBLE_DEVICES=0 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 0 & +CUDA_VISIBLE_DEVICES=1 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 1 & +CUDA_VISIBLE_DEVICES=2 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 2 & +CUDA_VISIBLE_DEVICES=3 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 3 & +CUDA_VISIBLE_DEVICES=4 python ${mainpy} --config 
${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 4 & +CUDA_VISIBLE_DEVICES=5 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 5 & +CUDA_VISIBLE_DEVICES=6 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 6 & +CUDA_VISIBLE_DEVICES=7 python ${mainpy} --config ${workdir}/../ctr/settings/tf_local_s1_w8.json --rank 7 & +wait \ No newline at end of file diff --git a/examples/rec/tf_ncf.py b/examples/rec/tf_ncf.py new file mode 100644 index 0000000..c8ca51f --- /dev/null +++ b/examples/rec/tf_ncf.py @@ -0,0 +1,47 @@ +import tensorflow as tf + + +def neural_mf(user_input, item_input, y_, num_users, num_items, embed_partitioner=None): + embed_dim = 8 + layers = [64, 32, 16, 8] + learning_rate = 0.01 + with tf.compat.v1.variable_scope('nmf', dtype=tf.float32): + with tf.device('/cpu:0'): + User_Embedding = tf.compat.v1.get_variable(name="user_embed", shape=( + num_users, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner) + Item_Embedding = tf.compat.v1.get_variable(name="item_embed", shape=( + num_items, embed_dim + layers[0] // 2), initializer=tf.random_normal_initializer(stddev=0.01), partitioner=embed_partitioner) + + user_latent = tf.nn.embedding_lookup(User_Embedding, user_input) + item_latent = tf.nn.embedding_lookup(Item_Embedding, item_input) + + W1 = tf.compat.v1.get_variable(name='W1', shape=( + layers[0], layers[1]), initializer=tf.random_normal_initializer(stddev=0.1)) + W2 = tf.compat.v1.get_variable(name='W2', shape=( + layers[1], layers[2]), initializer=tf.random_normal_initializer(stddev=0.1)) + W3 = tf.compat.v1.get_variable(name='W3', shape=( + layers[2], layers[3]), initializer=tf.random_normal_initializer(stddev=0.1)) + W4 = tf.compat.v1.get_variable(name='W4', shape=( + embed_dim + layers[3], 1), initializer=tf.random_normal_initializer(stddev=0.1)) + + with tf.device('/gpu:0'): + mf_user_latent, mlp_user_latent = tf.split( + user_latent, [embed_dim, layers[0] // 2], 1) + mf_item_latent, mlp_item_latent = tf.split( + item_latent, [embed_dim, layers[0] // 2], 1) + mf_vector = tf.multiply(mf_user_latent, mf_item_latent) + mlp_vector = tf.concat((mlp_user_latent, mlp_item_latent), 1) + fc1 = tf.matmul(mlp_vector, W1) + relu1 = tf.nn.relu(fc1) + fc2 = tf.matmul(relu1, W2) + relu2 = tf.nn.relu(fc2) + fc3 = tf.matmul(relu2, W3) + relu3 = tf.nn.relu(fc3) + concat_vector = tf.concat((mf_vector, relu3), 1) + y = tf.reshape(tf.matmul(concat_vector, W4), (-1,)) + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_) + loss = tf.reduce_mean(loss) + y = tf.sigmoid(y) + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate) + return loss, y, optimizer diff --git a/examples/runner/README.md b/examples/runner/README.md new file mode 100644 index 0000000..272dd45 --- /dev/null +++ b/examples/runner/README.md @@ -0,0 +1,27 @@ +## Usage +This directory contains examples using `heturun` command. 
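+ +All of the commands below pass a cluster configuration file to `heturun` via `-c`. For reference, `local_ps.yml` in this directory declares a single local node running one parameter server and four workers; the other `.yml` files here follow the same schema: +```yaml +nodes: + - host: localhost + servers: 1 + workers: 4 + chief: true +```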
+ +* Data Parallel (MLP model and WDL model): +```bash +# Local Data Parallel Using AllReduce +heturun -c local_allreduce.yml python run_mlp.py --config lar + +# Local Data Parallel Using AllReduce for Dense Parameters and PS for Sparse(Embedding) Parameters +heturun -c local_ps.yml python run_wdl.py --config lhy + +# Local Data Parallel Using PS +heturun -c local_ps.yml python run_mlp.py --config lps +heturun -c local_ps.yml python run_wdl.py --config lps + +# Distributed Data Parallel Using AllReduce +heturun -c remote_allreduce.yml python run_mlp.py --config rar + +# Distributed Data Parallel Using AllReduce for Dense Parameters and PS for Sparse(Embedding) Parameters +heturun -c remote_ps.yml python run_wdl.py --config rhy + +# Distributed Data Parallel Using PS +heturun -c remote_ps.yml python run_mlp.py --config rps +heturun -c remote_ps.yml python run_wdl.py --config rps +``` + +* For other parallel schemes, please refer to `parallel` directory. diff --git a/examples/runner/local_allreduce.yml b/examples/runner/local_allreduce.yml new file mode 100644 index 0000000..5028d3f --- /dev/null +++ b/examples/runner/local_allreduce.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 4 + chief: true diff --git a/examples/runner/local_ps.yml b/examples/runner/local_ps.yml new file mode 100644 index 0000000..c71709c --- /dev/null +++ b/examples/runner/local_ps.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 1 + workers: 4 + chief: true diff --git a/examples/runner/models/MLP.py b/examples/runner/models/MLP.py new file mode 100644 index 0000000..df60da8 --- /dev/null +++ b/examples/runner/models/MLP.py @@ -0,0 +1,33 @@ +import hetu as ht +from hetu import init + + +def fc(x, shape, name, with_relu=True): + weight = init.random_normal(shape=shape, stddev=0.1, name=name+'_weight') + bias = init.random_normal(shape=shape[-1:], stddev=0.1, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +def mlp(x, y_): + ''' + MLP model, for MNIST dataset. + + Parameters: + x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) + y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + Return: + loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) + y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) + ''' + + print("Building MLP model...") + x = fc(x, (784, 256), 'mlp_fc1', with_relu=True) + x = fc(x, (256, 256), 'mlp_fc2', with_relu=True) + y = fc(x, (256, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y, y_) + loss = ht.reduce_mean_op(loss, [0]) + return loss, y diff --git a/examples/runner/models/__init__.py b/examples/runner/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/runner/models/load_data.py b/examples/runner/models/load_data.py new file mode 100644 index 0000000..4edeb0d --- /dev/null +++ b/examples/runner/models/load_data.py @@ -0,0 +1,220 @@ +import numpy as np +import six.moves.cPickle as pickle +import gzip +import os + + +def load_mnist_data(dataset): + """ Load the dataset + Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + """ + # Download the MNIST dataset if it is not present + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. 
+ new_path = os.path.join( + os.path.split(__file__)[0], + dataset + ) + if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': + from six.moves import urllib + origin = ( + 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' + ) + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + # Load the dataset + with gzip.open(dataset, 'rb') as f: + try: + train_set, valid_set, test_set = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set = pickle.load(f) + # train_set, valid_set, test_set format: tuple(input, target) + # input is a numpy.ndarray of 2 dimensions (a matrix), np.float32 + # where each row corresponds to an example. target is a + # numpy.ndarray of 1 dimension (vector), np.int64 that has the same length + # as the number of rows in the input. It should give the target + # to the example with the same index in the input. + return train_set, valid_set, test_set + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + +########################################################################### +# adult +########################################################################### + + +def maybe_download(train_data, test_data): + import pandas as pd + """if adult data "train.csv" and "test.csv" are not in your directory, + download them. + """ + + COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", + "marital_status", "occupation", "relationship", "race", "gender", + "capital_gain", "capital_loss", "hours_per_week", "native_country", + "income_bracket"] + + if not os.path.exists(train_data): + print("downloading training data...") + df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + names=COLUMNS, skipinitialspace=True) + else: + df_train = pd.read_csv("train.csv") + + if not os.path.exists(test_data): + print("downloading testing data...") + df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", + names=COLUMNS, skipinitialspace=True, skiprows=1) + else: + df_test = pd.read_csv("test.csv") + + return df_train, df_test + + +def cross_columns(x_cols): + """simple helper to build the crossed columns in a pandas dataframe + """ + crossed_columns = dict() + colnames = ['_'.join(x_c) for x_c in x_cols] + for cname, x_c in zip(colnames, x_cols): + crossed_columns[cname] = x_c + return crossed_columns + + +def val2idx(df, cols): + """helper to index categorical columns before embeddings. 
+ """ + val_types = dict() + for c in cols: + val_types[c] = df[c].unique() + + val_to_idx = dict() + for k, v in val_types.items(): + val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} + + for k, v in val_to_idx.items(): + df[k] = df[k].apply(lambda x: v[x]) + + unique_vals = dict() + for c in cols: + unique_vals[c] = df[c].nunique() + + return df, unique_vals + + +def onehot(x): + from sklearn.preprocessing import OneHotEncoder + return np.array(OneHotEncoder().fit_transform(x).todense()) + + +def wide(df_train, df_test, wide_cols, x_cols, target): + import pandas as pd + print('Processing wide data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_wide = pd.concat([df_train, df_test]) + + crossed_columns_d = cross_columns(x_cols) + categorical_columns = list( + df_wide.select_dtypes(include=['object']).columns) + + wide_cols += list(crossed_columns_d.keys()) + + for k, v in crossed_columns_d.items(): + df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) + + df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] + + dummy_cols = [ + c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] + df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) + + train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) + assert all(train.columns == test.columns) + + cols = [c for c in train.columns if c != target] + X_train = train[cols].values + y_train = train[target].values.reshape(-1, 1) + X_test = test[cols].values + y_test = test[target].values.reshape(-1, 1) + return X_train, y_train, X_test, y_test + + +def load_adult_data(return_val=True): + import pandas as pd + df_train, df_test = maybe_download("train.csv", "test.csv") + + df_train['income_label'] = ( + df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + df_test['income_label'] = ( + df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) + + age_groups = [0, 25, 65, 90] + age_labels = range(len(age_groups) - 1) + df_train['age_group'] = pd.cut( + df_train['age'], age_groups, labels=age_labels) + df_test['age_group'] = pd.cut( + df_test['age'], age_groups, labels=age_labels) + + # columns for wide model + wide_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country', 'age_group'] + x_cols = (['education', 'occupation'], ['native_country', 'occupation']) + + # columns for deep model + embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', + 'relationship', 'race', 'gender', 'native_country'] + cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] + + target = 'income_label' + + x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( + df_train, df_test, wide_cols, x_cols, target) + x_train_wide = np.array(x_train_wide).astype(np.float32) + x_test_wide = np.array(x_test_wide).astype(np.float32) + + print('Processing deep data') + df_train['IS_TRAIN'] = 1 + df_test['IS_TRAIN'] = 0 + df_deep = pd.concat([df_train, df_test]) + + deep_cols = embedding_cols + cont_cols + df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + df_deep[cont_cols] = pd.DataFrame(scaler.fit_transform(df_train[cont_cols]), + columns=cont_cols) + df_deep, unique_vals = val2idx(df_deep, embedding_cols) + + train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) + test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', 
axis=1) + + x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) + y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) + x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) + y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) + + x_train_deep = np.transpose(x_train_deep) + x_test_deep = np.transpose(x_test_deep) + y_train = onehot(y_train) + y_test = onehot(y_test) + + if return_val: + return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test + else: + return x_train_deep, x_train_wide, y_train diff --git a/examples/runner/models/wdl_adult.py b/examples/runner/models/wdl_adult.py new file mode 100644 index 0000000..6cef28c --- /dev/null +++ b/examples/runner/models/wdl_adult.py @@ -0,0 +1,57 @@ +import hetu as ht +from hetu import init + + +def wdl_adult(X_deep, X_wide, y_, dense_param_ctx): + lr = 5 / 128 + dim_wide = 809 + dim_deep = 68 + + with ht.context(dense_param_ctx): + W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W") + W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1") + b1 = init.random_normal([50], stddev=0.1, name="b1") + W2 = init.random_normal([50, 20], stddev=0.1, name="W2") + b2 = init.random_normal([20], stddev=0.1, name="b2") + + # deep + Embedding = [] + X_deep_input = None + + for i in range(8): + Embedding_name = "Embedding_deep_" + str(i) + Embedding.append(init.random_normal( + [50, 8], stddev=0.1, name=Embedding_name)) + now = ht.embedding_lookup_op(Embedding[i], X_deep[i]) + now = ht.array_reshape_op(now, (-1, 8)) + if X_deep_input is None: + X_deep_input = now + else: + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + for i in range(4): + now = ht.array_reshape_op(X_deep[i + 8], (-1, 1)) + X_deep_input = ht.concat_op(X_deep_input, now, 1) + + mat1 = ht.matmul_op(X_deep_input, W1) + add1 = mat1 + ht.broadcastto_op(b1, mat1) + relu1 = ht.relu_op(add1) + dropout1 = relu1 + mat2 = ht.matmul_op(dropout1, W2) + add2 = mat2 + ht.broadcastto_op(b2, mat2) + relu2 = ht.relu_op(add2) + dropout2 = relu2 + dmodel = dropout2 + + # wide + wmodel = ht.concat_op(X_wide, dmodel, 1) + wmodel = ht.matmul_op(wmodel, W) + + prediction = wmodel + loss = ht.softmaxcrossentropy_op(prediction, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=lr) + train_op = opt.minimize(loss) + + return loss, prediction, y_, train_op diff --git a/examples/runner/parallel/README.md b/examples/runner/parallel/README.md new file mode 100644 index 0000000..f5fc8d9 --- /dev/null +++ b/examples/runner/parallel/README.md @@ -0,0 +1,35 @@ +## Usage +* Complex Pipeline Parallel (not using heturun): +```bash +mpirun --allow-run-as-root --tag-output -np 8 python complex_pipeline_mlp.py +``` + +* Simple Pipeline Parallel: +```bash +heturun -c config8.yml python simple_pipeline_mlp.py +``` + +* Data + Pipeline Parallel: +```bash +heturun -c config8.yml python data_pipeline_mlp.py +``` + +* Multiple Machine Data + Pipeline Parallel: +```bash +heturun -c dist_config8.yml python dist_data_pipeline_mlp.py +``` + +* Test Model Parallel (the following commands should give the same results): +```bash +heturun -c config3.yml python test_model_mlp_base.py --save +heturun -c config4.yml python test_model_mlp.py --split left +heturun -c config4.yml python test_model_mlp.py --split right +heturun -c config4.yml python test_model_mlp.py --split middle +``` + +* Data + Model (+ Pipeline) Parallel: +```bash +heturun -c config8.yml python data_model_pipeline_mlp.py 
--split left +heturun -c config8.yml python data_model_pipeline_mlp.py --split right +heturun -c config8.yml python data_model_pipeline_mlp.py --split middle +``` diff --git a/examples/runner/parallel/complex_pipeline_mlp.py b/examples/runner/parallel/complex_pipeline_mlp.py new file mode 100644 index 0000000..a9a3d25 --- /dev/null +++ b/examples/runner/parallel/complex_pipeline_mlp.py @@ -0,0 +1,200 @@ +import hetu as ht +from hetu import stream +from hetu import init + +import os +import sys +import json +import time +import argparse +import numpy as np +import logging + +np.random.seed(123) + + +def convert_to_one_hot(vals, max_val=0): + """Helper method to convert label array to one-hot array.""" + if max_val == 0: + max_val = vals.max() + 1 + one_hot_vals = np.zeros((vals.size, max_val)) + one_hot_vals[np.arange(vals.size), vals] = 1 + return one_hot_vals + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight = init.random_normal( + shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx) + bias = init.random_normal( + shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + args = parser.parse_args() + + # init and opt for both ranks + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + print("mpi_nccl init for gpu device: {}".format(device_id)) + executor_ctx = ht.gpu(device_id) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + + # init logger + logger = logging.getLogger() + ch = logging.StreamHandler() + formatter = logging.Formatter('[rank{}, PID{}]'.format( + device_id, os.getpid()) + ' %(asctime)s: %(message)s') + ch.setLevel(logging.DEBUG) + ch.setFormatter(formatter) + logger.addHandler(ch) + log = logger.warning + + # nccl communicate stream for pipeline_send/receive + communicate_stream = stream.create_stream_handle(executor_ctx) + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + x = ht.Variable(name="dataloader_x", trainable=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + + # model parallel + if comm.myRank.value == 0: + # rank0 + + # forward + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True, + ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (1024, 2048), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc3', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation_send_op = ht.pipeline_send_op( + activation, 1, comm, stream=communicate_stream) + + # backward + gradient_receive_op = ht.pipeline_receive_op( + 1, comm, ctx=executor_ctx, stream=communicate_stream) + required_vars = opt.get_var_list(activation) + 
opt.params = required_vars + grads = ht.gradients(activation, required_vars, + insert_grad=gradient_receive_op) + train_op = ht.optim.OptimizerOp(grads, opt) + + executor = ht.Executor( + [activation_send_op, train_op], ctx=executor_ctx) + + elif comm.myRank.value != 7: + # from rank1 to rank6 + previous_rank = comm.myRank.value - 1 + next_rank = comm.myRank.value + 1 + + # 1. receive activation from previous rank + activation_receive_op = ht.pipeline_receive_op( + previous_rank, comm, ctx=executor_ctx, stream=communicate_stream) + # forward + activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 2048), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc3', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + + # 2. send activation to next rank + activation_send_op = ht.pipeline_send_op( + activation, next_rank, comm, ctx=executor_ctx, stream=communicate_stream) + + # 3. receive gradients from next rank + gradient_receive_op = ht.pipeline_receive_op( + next_rank, comm, ctx=executor_ctx, stream=communicate_stream) + # backward + required_vars = opt.get_var_list(activation) + opt.params = required_vars + required_vars = [activation_receive_op] + required_vars + grads = ht.gradients(activation, required_vars, + insert_grad=gradient_receive_op) + train_op = ht.optim.OptimizerOp(grads[1:], opt) + + # 4. send gradients to previous rank + sendback_grad_op = ht.pipeline_send_op( + grads[0], previous_rank, comm, stream=communicate_stream) + + executor = ht.Executor( + [activation_send_op, sendback_grad_op, train_op], ctx=executor_ctx) + + else: + # rank7 + activation_receive_op = ht.pipeline_receive_op( + 6, comm, ctx=executor_ctx, stream=communicate_stream) + + # forward + activation = fc(activation_receive_op, (1024, 2048), 'mlp_fc1', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + activation = fc(activation, (2048, 1024), 'mlp_fc2', + with_relu=True, ctx=ht.gpu(comm.localRank.value)) + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + + # backward + required_vars = opt.get_var_list(loss) + opt.params = required_vars + required_vars = [activation_receive_op] + required_vars + grads = ht.gradients(loss, required_vars) + train_op = ht.optim.OptimizerOp(grads[1:], opt) + + sendback_grad_op = ht.pipeline_send_op( + grads[0], 6, comm, stream=communicate_stream) + executor = ht.Executor( + [loss, sendback_grad_op, train_op], ctx=executor_ctx) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + if comm.myRank.value == 0: + log("step {}:".format(step)) + if comm.myRank.value == 0: + executor.run(feed_dict={x: value_x_list[step % batch_num]}) + log("gpu0 ok") + elif comm.myRank.value == 7: + loss, _, _ = executor.run( + feed_dict={y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + log("gpu7 ok, loss: {}".format(loss[0])) + else: + executor.run() + log("gpu{} ok".format(comm.myRank.value)) + + # comm.stream.sync() + if communicate_stream: + communicate_stream.sync() + + end = time.time() + log("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/config3.yml b/examples/runner/parallel/config3.yml new file mode 100644 index 0000000..7c2ad6f --- /dev/null +++ b/examples/runner/parallel/config3.yml @@ -0,0 
+1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 3 + chief: true diff --git a/examples/runner/parallel/config4.yml b/examples/runner/parallel/config4.yml new file mode 100644 index 0000000..5028d3f --- /dev/null +++ b/examples/runner/parallel/config4.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 4 + chief: true diff --git a/examples/runner/parallel/config8.yml b/examples/runner/parallel/config8.yml new file mode 100644 index 0000000..2676c7d --- /dev/null +++ b/examples/runner/parallel/config8.yml @@ -0,0 +1,5 @@ +nodes: + - host: localhost + servers: 0 + workers: 8 + chief: true diff --git a/examples/runner/parallel/data_model_pipeline_mlp.py b/examples/runner/parallel/data_model_pipeline_mlp.py new file mode 100644 index 0000000..ff6539c --- /dev/null +++ b/examples/runner/parallel/data_model_pipeline_mlp.py @@ -0,0 +1,97 @@ +import hetu as ht + +import time +import argparse + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight = ht.init.random_normal( + shape=shape, stddev=0.04, name=name+'_weight', ctx=ctx) + bias = ht.init.random_normal( + shape=shape[-1:], stddev=0.04, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--split', type=str, default='left', + help='left, middle, right') + args = parser.parse_args() + assert args.split in ('left', 'middle', 'right') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context([ht.gpu(0), ht.gpu(4)]): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 2048), 'mlp_fc2', with_relu=True) + activation = fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True) + if args.split == 'left': + activation = ht.dispatch(activation, (2, 1)) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 1), duplicate=2) + elif args.split == 'right': + activation = ht.dispatch(activation, (1, 1), duplicate=2) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (1, 2)) + else: + activation = ht.dispatch(activation, (1, 2)) + weight = ht.dispatch(ht.init.random_normal( + shape=(1024, 2048), stddev=0.04, name='mlp_fc1_weight'), (2, 1)) + + with ht.context([(ht.gpu(1), ht.gpu(2)), (ht.gpu(5), ht.gpu(6))]): + activation = ht.matmul_op(activation, weight) + activation = ht.dispatch(activation, (1, 1)) + + with ht.context([ht.gpu(3), ht.gpu(7)]): + activation = ht.relu_op(activation) + activation = fc(activation, (2048, 2048), 'mlp_fc2', with_relu=True) + activation = 
fc(activation, (2048, 1024), 'mlp_fc3', with_relu=True) + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 3: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 3: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/data_pipeline_mlp.py b/examples/runner/parallel/data_pipeline_mlp.py new file mode 100644 index 0000000..076a5bd --- /dev/null +++ b/examples/runner/parallel/data_pipeline_mlp.py @@ -0,0 +1,66 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + with ht.context("gpu:0,gpu:4"): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True) + + with ht.context("gpu:1,gpu:5"): + activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True) + + with ht.context("gpu:2,gpu:6"): + activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True) + + with ht.context("gpu:3,gpu:7"): + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + executor = ht.Executor([loss, train_op]) + + print_devices = [3, 7] + + # training + steps = train_set_x.shape[0] // args.batch_size + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + if executor.local_rank in print_devices: + print('[step {}]: loss: {}'.format(step, loss_val[0])) diff --git a/examples/runner/parallel/dist_config8.yml b/examples/runner/parallel/dist_config8.yml new file mode 100644 index 0000000..4129fb9 --- /dev/null +++ b/examples/runner/parallel/dist_config8.yml @@ -0,0 +1,8 
@@ +nodes: + - host: daim118 + servers: 0 + workers: 4 + chief: true + - host: daim117 + servers: 0 + workers: 4 \ No newline at end of file diff --git a/examples/runner/parallel/dist_data_pipeline_mlp.py b/examples/runner/parallel/dist_data_pipeline_mlp.py new file mode 100644 index 0000000..e05d228 --- /dev/null +++ b/examples/runner/parallel/dist_data_pipeline_mlp.py @@ -0,0 +1,68 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np +import socket + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + with ht.context([ht.rgpu('daim117', 0), ht.rgpu('daim117', 1)]): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc0', with_relu=True) + + with ht.context([ht.rgpu('daim117', 2), ht.rgpu('daim117', 3)]): + activation = fc(activation, (1024, 1024), 'mlp_fc1', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc11', with_relu=True) + + with ht.context([ht.rgpu('daim118', 0), ht.rgpu('daim118', 1)]): + activation = fc(activation, (1024, 1024), 'mlp_fc2', with_relu=True) + activation = fc(activation, (1024, 1024), 'mlp_fc22', with_relu=True) + + with ht.context([ht.rgpu('daim118', 2), ht.rgpu('daim118', 3)]): + y_pred = fc(activation, (1024, 10), 'mlp_fc3', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + executor = ht.Executor([loss, train_op]) + + print_ranks = [2, 3] + hostname = socket.gethostname() + + # training + steps = train_set_x.shape[0] // args.batch_size + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + if executor.local_rank in print_ranks and hostname == 'daim118': + print('[step {}]: loss: {}'.format(step, loss_val[0])) diff --git a/examples/runner/parallel/simple_pipeline_mlp.py b/examples/runner/parallel/simple_pipeline_mlp.py new file mode 100644 index 0000000..d200364 --- /dev/null +++ b/examples/runner/parallel/simple_pipeline_mlp.py @@ -0,0 +1,76 @@ +import hetu as ht + +import os +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True): + weight = ht.init.random_normal(shape, stddev=0.04, name=name+'_weight') + bias = ht.init.random_normal(shape[-1:], stddev=0.04, name=name+'_bias') + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = 
argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=8, + help='training epochs') + parser.add_argument('--warmup', type=int, default=1, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, + default=10000, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.01, help='learning rate') + args = parser.parse_args() + + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + # pipeline parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + + for i in range(1, 7): + with ht.context(ht.gpu(i)): + activation = fc(activation, (1024, 1024), 'mlp_fc%d' % + (i + 1), with_relu=True) + + with ht.context(ht.gpu(7)): + y_pred = fc(activation, (1024, 10), 'mlp_fc8', with_relu=True) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + steps = train_set_x.shape[0] // args.batch_size + for epoch in range(args.epochs): + loss_vals = [] + if epoch == args.warmup: + start_time = time.time() + for step in range(steps): + start = step * args.batch_size + end = start + args.batch_size + loss_val, _ = executor.run(feed_dict={ + x: train_set_x[start:end], y_: train_set_y[start:end]}, convert_to_numpy_ret_vals=True) + loss_vals.append(loss_val) + if executor.rank == 7: + print('epoch: {}, loss: {}'.format(epoch, np.mean(loss_vals))) + + if executor.rank == 0: + end_time = time.time() + print("time elapsed for {} epochs: {}s".format( + args.epochs-args.warmup, round(end_time-start_time, 3))) diff --git a/examples/runner/parallel/test_model_mlp.py b/examples/runner/parallel/test_model_mlp.py new file mode 100644 index 0000000..2eeab1a --- /dev/null +++ b/examples/runner/parallel/test_model_mlp.py @@ -0,0 +1,92 @@ +import hetu as ht + +import time +import argparse +import numpy as np + + +def fc(x, shape, name, with_relu=True, ctx=None): + weight_save = np.load('std/' + name + '_weight.npy') + bias_save = np.load('std/' + name + '_bias.npy') + weight = ht.Variable(value=weight_save, name=name+'_weight', ctx=ctx) + bias = ht.Variable(value=bias_save, name=name+'_bias', ctx=ctx) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--split', type=str, default='left') + args = parser.parse_args() + assert args.split in ('left', 'right', 'middle') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + 
value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True) + weight_save = np.load('std/' + 'special_weight.npy') + weight = ht.Variable(value=weight_save, name='mlp_fc1_weight') + if args.split == 'left': + activation = ht.dispatch(activation, (2, 1)) + weight = ht.dispatch(weight, (1, 1), duplicate=2) + elif args.split == 'right': + activation = ht.dispatch(activation, (1, 1), duplicate=2) + weight = ht.dispatch(weight, (1, 2)) + else: + activation = ht.dispatch(activation, (1, 2)) + weight = ht.dispatch(weight, (2, 1)) + + with ht.context((ht.gpu(1), ht.gpu(2))): + activation = ht.matmul_op(activation, weight) + activation = ht.dispatch(activation, (1, 1)) + + with ht.context(ht.gpu(3)): + activation = ht.relu_op(activation) + y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 3: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 3: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/parallel/test_model_mlp_base.py b/examples/runner/parallel/test_model_mlp_base.py new file mode 100644 index 0000000..2941e96 --- /dev/null +++ b/examples/runner/parallel/test_model_mlp_base.py @@ -0,0 +1,94 @@ +import hetu as ht + +import time +import argparse +import os +import numpy as np + + +def fc(x, shape, name, with_relu=True, rank=-1): + weight_save = np.random.normal(0, 0.04, size=shape) + bias_save = np.random.normal(0, 0.04, size=shape[-1:]) + weight = ht.Variable(value=weight_save, name=name+'_weight') + bias = ht.Variable(value=bias_save, name=name+'_bias') + global args + if args.save and args.rank == rank: + np.save('std/' + name + '_weight.npy', weight_save) + np.save('std/' + name + '_bias.npy', bias_save) + x = ht.matmul_op(x, weight) + x = x + ht.broadcastto_op(bias, x) + if with_relu: + x = ht.relu_op(x) + return x + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=8, help='training steps') + parser.add_argument('--warmup', type=int, default=2, + help='warm up steps excluded from timing') + parser.add_argument('--batch-size', type=int, default=8, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.00001, help='learning rate') + parser.add_argument('--save', action='store_true') + global args + args = parser.parse_args() + if args.save: + comm = ht.wrapped_mpi_nccl_init() + args.rank = comm.rank + if args.rank == 0 and not os.path.exists('std'): + os.mkdir('std') + + # dataset + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + batch_size = 10000 + batch_num = 5 + value_x_list = [] + value_y_list = [] + for i 
in range(batch_num): + start = i * batch_size + ending = (i+1) * batch_size + value_x_list.append(train_set_x[start:ending]) + value_y_list.append(train_set_y[start:ending]) + + # model parallel + with ht.context(ht.gpu(0)): + x = ht.Variable(name="dataloader_x", trainable=False) + activation = fc(x, (784, 1024), 'mlp_fc1', with_relu=True, rank=0) + + with ht.context(ht.gpu(1)): + weight_save = np.random.normal(0, 0.04, size=(1024, 2048)) + if args.save and args.rank == 1: + np.save('std/' + 'special_weight.npy', weight_save) + weight = ht.Variable(value=weight_save, name='mlp_fc1_weight') + activation = ht.matmul_op(activation, weight) + + with ht.context(ht.gpu(2)): + activation = ht.relu_op(activation) + y_pred = fc(activation, (2048, 10), 'mlp_fc2', with_relu=False, rank=2) + y_ = ht.Variable(name="dataloader_y", trainable=False) + loss = ht.softmaxcrossentropy_op(y_pred, y_) + loss = ht.reduce_mean_op(loss, [0]) + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + train_op = opt.minimize(loss) + + executor = ht.Executor([loss, train_op]) + + # training + for step in range(args.steps): + if step == args.warmup: + start = time.time() + loss_val, _ = executor.run(feed_dict={ + x: value_x_list[step % batch_num], y_: value_y_list[step % batch_num]}, convert_to_numpy_ret_vals=True) + if executor.rank == 2: + print('step:', step, 'loss:', loss_val) + + end = time.time() + if executor.rank == 2: + print("time elapsed for {} steps: {}s".format( + args.steps-args.warmup, round(end-start, 3))) diff --git a/examples/runner/remote_allreduce.yml b/examples/runner/remote_allreduce.yml new file mode 100644 index 0000000..7da1df7 --- /dev/null +++ b/examples/runner/remote_allreduce.yml @@ -0,0 +1,6 @@ +nodes: + - host: daim118 + workers: 4 + chief: true + - host: daim117 + workers: 2 diff --git a/examples/runner/remote_ps.yml b/examples/runner/remote_ps.yml new file mode 100644 index 0000000..fee7c84 --- /dev/null +++ b/examples/runner/remote_ps.yml @@ -0,0 +1,8 @@ +nodes: + - host: daim118 + servers: 1 + workers: 4 + chief: true + - host: daim117 + servers: 1 + workers: 2 diff --git a/examples/runner/run_mlp.py b/examples/runner/run_mlp.py new file mode 100644 index 0000000..e773dcb --- /dev/null +++ b/examples/runner/run_mlp.py @@ -0,0 +1,118 @@ +import hetu as ht +from models import MLP + +import os +import numpy as np +import argparse +import json +from time import time + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='local', + help='[local, lps(localps), lar(localallreduce), rps(remoteps), rar]') + parser.add_argument('--batch-size', type=int, + default=128, help='batch size') + parser.add_argument('--learning-rate', type=float, + default=0.1, help='learning rate') + parser.add_argument('--opt', type=str, default='sgd', + help='optimizer to be used, default sgd; sgd / momentum / adagrad / adam') + parser.add_argument('--num-epochs', type=int, + default=10, help='epoch number') + parser.add_argument('--validate', action='store_true', + help='whether to use validation') + parser.add_argument('--timing', action='store_true', + help='whether to time the training phase') + args = parser.parse_args() + + dataset = 'MNIST' + + assert args.opt in ['sgd', 'momentum', 'nesterov', + 'adagrad', 'adam'], 'Optimizer not supported!' 
+ if args.opt == 'sgd': + print('Use SGD Optimizer.') + opt = ht.optim.SGDOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'momentum': + print('Use Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer(learning_rate=args.learning_rate) + elif args.opt == 'nesterov': + print('Use Nesterov Momentum Optimizer.') + opt = ht.optim.MomentumOptimizer( + learning_rate=args.learning_rate, nesterov=True) + elif args.opt == 'adagrad': + print('Use AdaGrad Optimizer.') + opt = ht.optim.AdaGradOptimizer( + learning_rate=args.learning_rate, initial_accumulator_value=0.1) + else: + print('Use Adam Optimizer.') + opt = ht.optim.AdamOptimizer(learning_rate=args.learning_rate) + + # data loading + print('Loading %s data...' % dataset) + if dataset == 'MNIST': + datasets = ht.data.mnist() + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + # train_set_x: (50000, 784), train_set_y: (50000,) + # valid_set_x: (10000, 784), valid_set_y: (10000,) + # x_shape = (args.batch_size, 784) + # y_shape = (args.batch_size, 10) + + # model definition + ctx = { + 'local': ht.gpu(0), + 'lps': [ht.cpu(0), ht.gpu(0), ht.gpu(1), ht.gpu(4), ht.gpu(5)], + 'lar': [ht.gpu(1), ht.gpu(2), ht.gpu(3), ht.gpu(6)], + 'rps': ['cpu:0', 'daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3'], + 'rar': ['daim118:gpu:0', 'daim118:gpu:2', 'daim118:gpu:4', 'daim118:gpu:6', 'daim117:gpu:1', 'daim117:gpu:3'] + }[args.config] + with ht.context(ctx): + print('Building model...') + x = ht.dataloader_op([ + ht.Dataloader(train_set_x, args.batch_size, 'train'), + ht.Dataloader(valid_set_x, args.batch_size, 'validate'), + ]) + y_ = ht.dataloader_op([ + ht.Dataloader(train_set_y, args.batch_size, 'train'), + ht.Dataloader(valid_set_y, args.batch_size, 'validate'), + ]) + + loss, y = MLP.mlp(x, y_) + train_op = opt.minimize(loss) + + executor = ht.Executor( + {'train': [loss, y, train_op], 'validate': [loss, y, y_]}) + n_train_batches = executor.get_batch_num('train') + n_valid_batches = executor.get_batch_num('validate') + + # training + print("Start training loop...") + for i in range(args.num_epochs): + print("Epoch %d" % i) + loss_all = 0 + if args.timing: + start = time() + for minibatch_index in range(n_train_batches): + loss_val, predict_y, _ = executor.run('train') + loss_val = loss_val.asnumpy() + loss_all += loss_val * x.dataloaders['train'].last_batch_size + loss_all /= len(train_set_x) + print("Loss = %f" % loss_all) + if args.timing: + end = time() + print("Time = %f" % (end - start)) + + if args.validate: + correct_predictions = [] + for minibatch_index in range(n_valid_batches): + loss_val, valid_y_predicted, y_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + correct_prediction = np.equal( + np.argmax(y_val, 1), + np.argmax(valid_y_predicted, 1)).astype(np.float32) + correct_predictions.extend(correct_prediction) + accuracy = np.mean(correct_predictions) + print("Validation accuracy = %f" % accuracy) diff --git a/examples/runner/run_wdl.py b/examples/runner/run_wdl.py new file mode 100644 index 0000000..e4163c7 --- /dev/null +++ b/examples/runner/run_wdl.py @@ -0,0 +1,130 @@ +import hetu as ht +from hetu.launcher import launch + +import os +import numpy as np +import yaml +import time +import argparse +from tqdm import tqdm +from sklearn import metrics +from models import load_data, wdl_adult + + +def worker(args): + def train(iterations, auc_enabled=True, tqdm_enabled=False): + localiter 
= tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + train_loss = [] + train_acc = [] + if auc_enabled: + train_auc = [] + for it in localiter: + loss_val, predict_y, y_val, _ = executor.run( + 'train', convert_to_numpy_ret_vals=True) + acc_val = np.equal( + np.argmax(y_val, 1), + np.argmax(predict_y, 1)).astype(np.float32) + train_loss.append(loss_val[0]) + train_acc.append(acc_val) + if auc_enabled: + train_auc.append(metrics.roc_auc_score(y_val, predict_y)) + if auc_enabled: + return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc) + else: + return np.mean(train_loss), np.mean(train_acc) + + def validate(iterations, tqdm_enabled=False): + localiter = tqdm(range(iterations) + ) if tqdm_enabled else range(iterations) + test_loss = [] + test_acc = [] + test_auc = [] + for it in localiter: + loss_val, test_y_predicted, y_test_val = executor.run( + 'validate', convert_to_numpy_ret_vals=True) + correct_prediction = np.equal( + np.argmax(y_test_val, 1), + np.argmax(test_y_predicted, 1)).astype(np.float32) + test_loss.append(loss_val[0]) + test_acc.append(correct_prediction) + test_auc.append(metrics.roc_auc_score( + y_test_val, test_y_predicted)) + return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc) + + batch_size = 128 + + ctx = { + 'local': 'gpu:0', + 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7', + 'lhy': 'cpu:0,gpu:1,gpu:2,gpu:3,gpu:6', + 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3', + 'rhy': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3' + }[args.config] + dense_param_ctx = {'local': 'gpu:0', 'lps': 'cpu:0,gpu:0,gpu:1,gpu:2,gpu:7', 'lhy': 'gpu:1,gpu:2,gpu:3,gpu:6', + 'rps': 'cpu:0;daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3', + 'rhy': 'daim118:gpu:0;daim118:gpu:2;daim118:gpu:4;daim118:gpu:6;daim117:gpu:1;daim117:gpu:3'}[args.config] + with ht.context(ctx): + x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_data.load_adult_data() + dense_input = [ + ht.dataloader_op([ + [x_train_deep[:, i], batch_size, 'train'], + [x_test_deep[:, i], batch_size, 'validate'], + ]) for i in range(12) + ] + sparse_input = ht.dataloader_op([ + [x_train_wide, batch_size, 'train'], + [x_test_wide, batch_size, 'validate'], + ]) + y_ = ht.dataloader_op([ + [y_train, batch_size, 'train'], + [y_test, batch_size, 'validate'], + ]) + print("Data loaded.") + + loss, prediction, y_, train_op = wdl_adult.wdl_adult( + dense_input, sparse_input, y_, dense_param_ctx) + + eval_nodes = {'train': [loss, prediction, y_, train_op]} + if args.val: + print('Validation enabled...') + eval_nodes['validate'] = [loss, prediction, y_] + executor = ht.Executor(eval_nodes, + cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123) + + total_epoch = args.nepoch if args.nepoch > 0 else 50 + for ep in range(total_epoch): + if ep == 5: + start = time.time() + print("epoch %d" % ep) + ep_st = time.time() + train_loss, train_acc = train( + executor.get_batch_num('train'), auc_enabled=False) + ep_en = time.time() + if args.val: + val_loss, val_acc, val_auc = validate( + executor.get_batch_num('validate')) + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" + % (train_loss, train_acc, ep_en - ep_st, val_loss, val_acc, val_auc)) + else: + print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" + % (train_loss, train_acc, ep_en - ep_st)) + print('all time:', time.time() - 
start) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, default='local', + help='[local, lps(localps), lhy(localhybrid), rps(remoteps), rhy]') + parser.add_argument("--val", action="store_true", + help="whether to use validation") + parser.add_argument("--all", action="store_true", + help="whether to use all data") + parser.add_argument("--bsp", action="store_true", + help="whether to use bsp instead of asp") + parser.add_argument("--cache", default=None, help="cache policy") + parser.add_argument("--bound", default=100, help="cache bound") + parser.add_argument("--nepoch", type=int, default=-1, + help="num of epochs, each train 1/10 data") + args = parser.parse_args() + worker(args) diff --git a/hetu.exp b/hetu.exp new file mode 100755 index 0000000..1528483 --- /dev/null +++ b/hetu.exp @@ -0,0 +1,5 @@ +#!/bin/bash +path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +echo "Hetu root is" $path +export PATH="$path/bin:$PATH" +export PYTHONPATH="$path/python:$path/build/lib:$path/third_party/GraphMix/python:$PYTHONPATH:$path/third_party/HetuML/hetuml/python" diff --git a/img/alibabacloud.png b/img/alibabacloud.png new file mode 100644 index 0000000..2b0095e Binary files /dev/null and b/img/alibabacloud.png differ diff --git a/img/hetu.png b/img/hetu.png new file mode 100644 index 0000000..5791951 Binary files /dev/null and b/img/hetu.png differ diff --git a/img/kuaishou.png b/img/kuaishou.png new file mode 100644 index 0000000..acce838 Binary files /dev/null and b/img/kuaishou.png differ diff --git a/img/tencent.png b/img/tencent.png new file mode 100644 index 0000000..26fdb92 Binary files /dev/null and b/img/tencent.png differ diff --git a/ps-lite/.gitignore b/ps-lite/.gitignore new file mode 100644 index 0000000..ba96dc2 --- /dev/null +++ b/ps-lite/.gitignore @@ -0,0 +1,2 @@ +src/meta.pb.cc +src/meta.pb.h diff --git a/ps-lite/CMakeLists.txt b/ps-lite/CMakeLists.txt new file mode 100644 index 0000000..4d6fce6 --- /dev/null +++ b/ps-lite/CMakeLists.txt @@ -0,0 +1,41 @@ +# port from https://github.com/dmlc/ps-lite/blob/master/CMakeLists.txt + +aux_source_directory(src PS_SRC) +add_library(ps SHARED ${PS_SRC}) +target_include_directories(ps PUBLIC include) + +# find and build zeroMQ +find_package(ZMQ 4.3.2) +if(NOT ZMQ_FOUND) + message(STATUS "ZMQ not Found, Preparing ZMQ ...") + FetchContent_Declare(zmq URL https://github.com/zeromq/libzmq/releases/download/v4.3.2/zeromq-4.3.2.zip) + FetchContent_GetProperties(zmq) + if(NOT zmq_POPULATED) + FetchContent_Populate(zmq) + if(POLICY CMP0077) # Avoid building shared library and tests on CMake 3.13+ + cmake_policy(SET CMP0077 NEW) + set(BUILD_SHARED OFF CACHE BOOL "") + set(BUILD_TESTS OFF CACHE BOOL "") + endif() + add_subdirectory(${zmq_SOURCE_DIR} ${zmq_BINARY_DIR}) + endif() + target_link_libraries(ps PRIVATE libzmq-static) +else() + target_include_directories(ps PRIVATE ${ZMQ_INCLUDE_DIRS}) + target_link_libraries(ps PRIVATE ${ZMQ_LIBRARIES}) +endif() + +# find and build protobuf +find_package(Protobuf) +if(NOT Protobuf_FOUND) + message(FATAL_ERROR "Protobuf not found, use `conda install protobuf`") +endif() + +set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) +file(GLOB_RECURSE PROTO_FILES "proto/*.proto") +protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) +target_sources(ps PRIVATE ${PROTO_SRCS}) +target_include_directories(ps PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # PROTO_HDRS files are here +target_include_directories(ps PRIVATE ${PROTOBUF_INCLUDE_DIR}) 
+target_include_directories(ps PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_link_libraries(ps PRIVATE ${PROTOBUF_LIBRARY})
diff --git a/ps-lite/README.md b/ps-lite/README.md
new file mode 100644
index 0000000..4587c0f
--- /dev/null
+++ b/ps-lite/README.md
@@ -0,0 +1,83 @@
+# PS-lite Module [adapted from https://github.com/dmlc/ps-lite]
+
+---
+
+## Usage Guide
+
+The PS-lite module is a lightweight C++ parameter server with a ctypes Python interface. It provides a list of PS functions that are useful in distributed training.
+
+To use PS, we need three roles: worker, server and scheduler. Workers are the training processes. Servers are where the parameters are stored. The scheduler sets up and tears down the connections. There can be multiple servers and workers, but only one scheduler.
+
+Currently, we only provide a Python interface for Hetu. Since the module depends on symbols from Hetu, the PS functions can only be used after `import hetu`. Here is a quick example of how to use ps-lite with Hetu.
+
+```python
+# worker.py
+import hetu
+import numpy as np
+import ctypes
+# create arrays
+tgt_array = hetu.ndarray.empty([128])
+name = 0  # a number that identifies a parameter; should be the same among all workers
+param_type = 0  # 0 for dense parameter
+# PS initialize
+hetu.worker_init()
+# PS functions here
+comm = hetu.get_worker_communicate()
+# InitTensor(node_name, param_type, length, width, init_type, init_param_a, init_param_b, seed, opt_type, opt_args, num_opt_args)
+# This function is synchronous.
+comm.InitTensor(name, param_type, 128, 1, 0, 5.0, 1.0, 123, 0, (ctypes.c_float * 1)(0.1), 1)
+comm.Pull(name, tgt_array.handle)
+comm.Wait(name)
+print(tgt_array.asnumpy())
+# PS finalize
+hetu.worker_finish()
+```
+We also need server and scheduler scripts:
+```python
+# server.py
+import hetu
+hetu.server_init()
+hetu.server_finish()
+```
+
+```python
+# scheduler.py
+import hetu
+hetu.scheduler_init()
+hetu.scheduler_finish()
+```
+
+To run the scripts, we use environment variables to specify which IP address and port each role uses. It is recommended to store these environment variables in a yaml or json file (see the Python launcher sketch at the end of this README).
+
+```shell
+export DMLC_PS_ROOT_URI=127.0.0.1 DMLC_PS_ROOT_PORT=4080 DMLC_NUM_WORKER=1 DMLC_NUM_SERVER=1 DMLC_PS_VAN_TYPE=p3
+DMLC_ROLE=scheduler python3 scheduler.py &
+DMLC_ROLE=server SERVER_ID=0 DMLC_PS_SERVER_URI=127.0.0.1 DMLC_PS_SERVER_PORT=4081 python3 server.py &
+DMLC_ROLE=worker WORKER_ID=0 DMLC_PS_WORKER_URI=127.0.0.1 DMLC_PS_WORKER_PORT=4082 python3 worker.py
+```
+
+## PS functions
+
+We provide a list of useful parameter server functions for training.
+
+The module can also be easily extended with new PS functions. There are several steps to follow.
+
+1. Create an enum value in psf/PSFunc.h and write a struct to define the PS function.
+
+   ```C++
+   template<> struct PSFData<NewPSFType> {  // NewPSFType: the enum value created above
+     using Request = tuple<
+       unsigned long,
+       SArray<float>
+     >;
+     using Response = tuple<>;
+     static void _callback(const Response &response) {/* callback here */}
+   };
+   ```
+
+   Here we can use scalar types like int, float, ... or arrays as function parameters. Note that arrays are shared while scalars are copied.
+
+2. Implement the server handler in server/PSFHandler.h.
+
+3. Use `kvworker.Request` to launch your PS function and `kvworker.Wait` to wait until the callback finishes; see more examples in PSAgent.h. We can also write a Python binding to expose the PS function to the Python layer.
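+
+As a complement to the shell commands above, the following is a minimal local-launch sketch. It is not part of the module itself: the file name `ps_env.json` and the use of `subprocess` are illustrative assumptions; it simply stores the shared DMLC_* variables in a json file, as recommended above, and spawns the three role scripts shown earlier.
+
+```python
+# launch_local_ps.py -- rough sketch; assumes worker.py / server.py /
+# scheduler.py from the examples above are in the current directory.
+import json
+import os
+import subprocess
+
+# store the shared DMLC_* variables in a json file, as recommended above
+with open("ps_env.json", "w") as f:
+    json.dump({"DMLC_PS_ROOT_URI": "127.0.0.1", "DMLC_PS_ROOT_PORT": "4080",
+               "DMLC_NUM_WORKER": "1", "DMLC_NUM_SERVER": "1",
+               "DMLC_PS_VAN_TYPE": "p3"}, f)
+
+with open("ps_env.json") as f:
+    shared = json.load(f)
+
+def spawn(script, role_env):
+    # each role inherits the shared variables plus its role-specific ones
+    env = dict(os.environ)
+    env.update(shared)
+    env.update(role_env)
+    return subprocess.Popen(["python3", script], env=env)
+
+procs = [
+    spawn("scheduler.py", {"DMLC_ROLE": "scheduler"}),
+    spawn("server.py", {"DMLC_ROLE": "server", "SERVER_ID": "0",
+                        "DMLC_PS_SERVER_URI": "127.0.0.1",
+                        "DMLC_PS_SERVER_PORT": "4081"}),
+    spawn("worker.py", {"DMLC_ROLE": "worker", "WORKER_ID": "0",
+                        "DMLC_PS_WORKER_URI": "127.0.0.1",
+                        "DMLC_PS_WORKER_PORT": "4082"}),
+]
+for p in procs:
+    p.wait()
+```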
+ diff --git a/ps-lite/include/common/dmlc_base.h b/ps-lite/include/common/dmlc_base.h new file mode 100644 index 0000000..f33ba10 --- /dev/null +++ b/ps-lite/include/common/dmlc_base.h @@ -0,0 +1,195 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file base.h + * \brief defines configuration macros + */ +#ifndef DMLC_BASE_H_ +#define DMLC_BASE_H_ + +/*! \brief whether use glog for logging */ +#ifndef DMLC_USE_GLOG +#define DMLC_USE_GLOG 0 +#endif + +/*! + * \brief whether throw dmlc::Error instead of + * directly calling abort when FATAL error occured + * NOTE: this may still not be perfect. + * do not use FATAL and CHECK in destructors + */ +#ifndef DMLC_LOG_FATAL_THROW +#define DMLC_LOG_FATAL_THROW 1 +#endif + +/*! + * \brief Whether to print stack trace for fatal error, + * enabled on linux when using gcc. + */ +#if (defined(__GNUC__) && !defined(__MINGW32__) && !defined(__sun) \ + && !defined(__SVR4) && !(defined __MINGW64__) && !(defined __ANDROID__)) +#if (!defined(DMLC_LOG_STACK_TRACE)) +#define DMLC_LOG_STACK_TRACE 1 +#endif +#if (!defined(DMLC_LOG_STACK_TRACE_SIZE)) +#define DMLC_LOG_STACK_TRACE_SIZE 10 +#endif +#endif + +/*! \brief whether compile with hdfs support */ +#ifndef DMLC_USE_HDFS +#define DMLC_USE_HDFS 0 +#endif + +/*! \brief whether compile with s3 support */ +#ifndef DMLC_USE_S3 +#define DMLC_USE_S3 0 +#endif + +/*! \brief whether or not use parameter server */ +#ifndef DMLC_USE_PS +#define DMLC_USE_PS 0 +#endif + +/*! \brief whether or not use c++11 support */ +#ifndef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 \ + (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L \ + || defined(_MSC_VER)) +#endif + +/// check if g++ is before 4.6 +#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__) +#if __GNUC__ == 4 && __GNUC_MINOR__ < 6 +#pragma message("Will need g++-4.6 or higher to compile all" \ + "the features in dmlc-core, " \ + "compile without c++11, some features may be disabled") +#undef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 0 +#endif +#endif + +/*! + * \brief Disable copy constructor and assignment operator. + * + * If C++11 is supported, both copy and move constructors and + * assignment operators are deleted explicitly. Otherwise, they are + * only declared but not implemented. Place this macro in private + * section if C++11 is not available. + */ +#ifndef DISALLOW_COPY_AND_ASSIGN +#if DMLC_USE_CXX11 +#define DISALLOW_COPY_AND_ASSIGN(T) \ + T(T const &) = delete; \ + T(T &&) = delete; \ + T &operator=(T const &) = delete; \ + T &operator=(T &&) = delete +#else +#define DISALLOW_COPY_AND_ASSIGN(T) \ + T(T const &); \ + T &operator=(T const &) +#endif +#endif + +/// +/// code block to handle optionally loading +/// +#if !defined(__GNUC__) +#define fopen64 std::fopen +#endif +#ifdef _MSC_VER +#if _MSC_VER < 1900 +// NOTE: sprintf_s is not equivalent to snprintf, +// they are equivalent when success, which is sufficient for our case +#define snprintf sprintf_s +#define vsnprintf vsprintf_s +#endif +#else +#ifdef _FILE_OFFSET_BITS +#if _FILE_OFFSET_BITS == 32 +#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit") +#endif +#endif + +#ifdef __APPLE__ +#define off64_t off_t +#define fopen64 std::fopen +#endif + +extern "C" { +#include +} +#endif + +#ifdef _MSC_VER +//! 
\cond Doxygen_Suppress +typedef signed char int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned char uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +//! \endcond +#else +#include +#endif +#include +#include + +/*! \brief namespace for dmlc */ +namespace dmlc { +/*! + * \brief safely get the beginning address of a vector + * \param vec input vector + * \return beginning address of a vector + */ +template +inline T *BeginPtr(std::vector &vec) { // NOLINT(*) + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! + * \brief get the beginning address of a vector + * \param vec input vector + * \return beginning address of a vector + */ +template +inline const T *BeginPtr(const std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! + * \brief get the beginning address of a vector + * \param str input string + * \return beginning address of a string + */ +inline char *BeginPtr(std::string &str) { // NOLINT(*) + if (str.length() == 0) + return NULL; + return &str[0]; +} +/*! + * \brief get the beginning address of a vector + * \param str input string + * \return beginning address of a string + */ +inline const char *BeginPtr(const std::string &str) { + if (str.length() == 0) + return NULL; + return &str[0]; +} +} // namespace dmlc + +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define constexpr const +#define alignof __alignof +#endif + +#endif // DMLC_BASE_H_ diff --git a/ps-lite/include/common/logging.h b/ps-lite/include/common/logging.h new file mode 100644 index 0000000..0aa23bc --- /dev/null +++ b/ps-lite/include/common/logging.h @@ -0,0 +1,349 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file logging.h + * \brief defines logging macros of dmlc + * allows use of GLOG, fall back to internal + * implementation when disabled + */ +#ifndef DMLC_LOGGING_H_ +#define DMLC_LOGGING_H_ +#include +#include +#include +#include +#include +#include +#include "dmlc_base.h" + +#if DMLC_LOG_STACK_TRACE +#include +#endif + +#if DMLC_LOG_STACK_TRACE +#include +#endif + +namespace dmlc { +/*! + * \brief exception class that will be thrown by + * default logger if DMLC_LOG_FATAL_THROW == 1 + */ +struct Error : public std::runtime_error { + /*! + * \brief constructor + * \param s the error message + */ + explicit Error(const std::string &s) : std::runtime_error(s) { + } +}; +} // namespace dmlc + +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define noexcept(a) +#endif + +#if DMLC_USE_CXX11 +#define DMLC_THROW_EXCEPTION noexcept(false) +#else +#define DMLC_THROW_EXCEPTION +#endif + +#if DMLC_USE_GLOG +#include + +namespace dmlc { +inline void InitLogging(const char *argv0) { + google::InitGoogleLogging(argv0); +} +} // namespace dmlc + +#else +// use a light version of glog +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4722) +#endif + +namespace dmlc { +inline void InitLogging(const char *argv0) { + // DO NOTHING +} + +// Always-on checking +#define CHECK(x) \ + if (!(x)) \ + dmlc::LogMessageFatal(__FILE__, __LINE__).stream() << "Check " \ + "failed: " #x \ + << ' ' +#define CHECK_LT(x, y) CHECK((x) < (y)) +#define CHECK_GT(x, y) CHECK((x) > (y)) +#define CHECK_LE(x, y) CHECK((x) <= (y)) +#define CHECK_GE(x, y) CHECK((x) >= (y)) +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) +#define CHECK_NOTNULL(x) \ + ((x) == NULL ? 
dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << "Check notnull: " #x << ' ', \ + (x) : (x)) // NOLINT(*) +// Debug-only checking. +#ifdef NDEBUG +#define DCHECK(x) \ + while (false) \ + CHECK(x) +#define DCHECK_LT(x, y) \ + while (false) \ + CHECK((x) < (y)) +#define DCHECK_GT(x, y) \ + while (false) \ + CHECK((x) > (y)) +#define DCHECK_LE(x, y) \ + while (false) \ + CHECK((x) <= (y)) +#define DCHECK_GE(x, y) \ + while (false) \ + CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) \ + while (false) \ + CHECK((x) == (y)) +#define DCHECK_NE(x, y) \ + while (false) \ + CHECK((x) != (y)) +#else +#define DCHECK(x) CHECK(x) +#define DCHECK_LT(x, y) CHECK((x) < (y)) +#define DCHECK_GT(x, y) CHECK((x) > (y)) +#define DCHECK_LE(x, y) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) CHECK((x) == (y)) +#define DCHECK_NE(x, y) CHECK((x) != (y)) +#endif // NDEBUG + +#define LOG_INFO dmlc::LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LOG_INFO +#define LOG_WARNING LOG_INFO +#define LOG_FATAL dmlc::LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +// Poor man version of VLOG +#define VLOG(x) LOG_INFO.stream() + +#define LOG(severity) LOG_##severity.stream() +#define LG LOG_INFO.stream() +#define LF LOG_FATAL.stream() +#define LGMT dmlc::LogMessageMultiThread(__FILE__, __LINE__).stream() +#define LOG_IF(severity, condition) \ + !(condition) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) + +#ifdef NDEBUG +#define LOG_DFATAL LOG_ERROR +#define DFATAL ERROR +#define DLOG(severity) \ + true ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) +#define DLOG_IF(severity, condition) \ + (true || !(condition)) ? (void)0 : dmlc::LogMessageVoidify() & LOG(severity) +#else +#define LOG_DFATAL LOG_FATAL +#define DFATAL FATAL +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#endif + +// Poor man version of LOG_EVERY_N +#define LOG_EVERY_N(severity, n) LOG(severity) + +class DateLogger { +public: + DateLogger() { +#if defined(_MSC_VER) + _tzset(); +#endif + } + const char *HumanDate() { +#if defined(_MSC_VER) + _strtime_s(buffer_, sizeof(buffer_)); +#else + time_t time_value = time(NULL); + struct tm now; + localtime_r(&time_value, &now); + snprintf(buffer_, sizeof(buffer_), "%02d:%02d:%02d", now.tm_hour, + now.tm_min, now.tm_sec); +#endif + return buffer_; + } + +private: + char buffer_[9]; +}; + +class LogMessage { +public: + LogMessage(const char *file, int line) : +#ifdef __ANDROID__ + log_stream_(std::cout) +#else + log_stream_(std::cerr) +#endif + { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + ~LogMessage() { + log_stream_ << "\n"; + } + std::ostream &stream() { + return log_stream_; + } + +protected: + std::ostream &log_stream_; + +private: + DateLogger pretty_date_; + LogMessage(const LogMessage &); + void operator=(const LogMessage &); +}; + +#if DMLC_LOG_STACK_TRACE +inline std::string Demangle(char const *msg_str) { + using std::string; + string msg(msg_str); + size_t symbol_start = string::npos; + size_t symbol_end = string::npos; + if (((symbol_start = msg.find("_Z")) != string::npos) + && (symbol_end = msg.find_first_of(" +", symbol_start))) { + string left_of_symbol(msg, 0, symbol_start); + string symbol(msg, symbol_start, symbol_end - symbol_start); + string right_of_symbol(msg, symbol_end); + + int status = 0; + size_t length = string::npos; + std::unique_ptr demangled_symbol = { + abi::__cxa_demangle(symbol.c_str(), 0, 
&length, &status), + &std::free}; + if (demangled_symbol && status == 0 && length > 0) { + string symbol_str(demangled_symbol.get()); + std::ostringstream os; + os << left_of_symbol << symbol_str << right_of_symbol; + return os.str(); + } + } + return string(msg_str); +} + +inline std::string StackTrace() { + using std::string; + std::ostringstream stacktrace_os; + const int MAX_STACK_SIZE = DMLC_LOG_STACK_TRACE_SIZE; + void *stack[MAX_STACK_SIZE]; + int nframes = backtrace(stack, MAX_STACK_SIZE); + stacktrace_os << "Stack trace returned " << nframes + << " entries:" << std::endl; + char **msgs = backtrace_symbols(stack, nframes); + if (msgs != nullptr) { + for (int frameno = 0; frameno < nframes; ++frameno) { + string msg = dmlc::Demangle(msgs[frameno]); + stacktrace_os << "[bt] (" << frameno << ") " << msg << "\n"; + } + } + free(msgs); + string stack_trace = stacktrace_os.str(); + return stack_trace; +} + +#else // DMLC_LOG_STACK_TRACE is off + +inline std::string demangle(char const *msg_str) { + return std::string(); +} + +inline std::string StackTrace() { + return std::string("stack traces not available when " + "DMLC_LOG_STACK_TRACE is disabled at compile time."); +} + +#endif // DMLC_LOG_STACK_TRACE + +#if DMLC_LOG_FATAL_THROW == 0 +class LogMessageFatal : public LogMessage { +public: + LogMessageFatal(const char *file, int line) : LogMessage(file, line) { + } + ~LogMessageFatal() { + log_stream_ << "\n"; + abort(); + } + +private: + LogMessageFatal(const LogMessageFatal &); + void operator=(const LogMessageFatal &); +}; +#else +class LogMessageFatal { +public: + LogMessageFatal(const char *file, int line) { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + std::ostringstream &stream() { + return log_stream_; + } + ~LogMessageFatal() DMLC_THROW_EXCEPTION { +#if DMLC_LOG_STACK_TRACE + log_stream_ << "\n\n" << StackTrace() << "\n"; +#endif + // throwing out of destructor is evil + // hopefully we can do it here + // also log the message before throw + LOG(ERROR) << log_stream_.str(); + throw Error(log_stream_.str()); + } + +private: + std::ostringstream log_stream_; + DateLogger pretty_date_; + LogMessageFatal(const LogMessageFatal &); + void operator=(const LogMessageFatal &); +}; +#endif + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". +class LogMessageVoidify { +public: + LogMessageVoidify() { + } + // This has to be an operator with a precedence lower than << but + // higher than "?:". See its usage. 
+ void operator&(std::ostream &) { + } +}; + +class LogMessageMultiThread { +public: + LogMessageMultiThread(const char *file, int line) { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + std::ostringstream &stream() { + return log_stream_; + } + ~LogMessageMultiThread() { + log_stream_ << "\n"; + } + +private: + std::ostringstream log_stream_; + DateLogger pretty_date_; + LogMessageMultiThread(const LogMessageMultiThread &); + void operator=(const LogMessageMultiThread &); +}; + +} // namespace dmlc + +#endif +#endif // DMLC_LOGGING_H_ diff --git a/ps-lite/include/common/sarray.h b/ps-lite/include/common/sarray.h new file mode 100644 index 0000000..71c7cea --- /dev/null +++ b/ps-lite/include/common/sarray.h @@ -0,0 +1,328 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "logging.h" + +/** + * \brief Shared array + * + * A smart array that retains shared ownership. It provides similar + * functionalities comparing to std::vector, including data(), size(), + * operator[], resize(), clear(). SArray can be easily constructed from + * std::vector, such as + * + * \code + * std::vector a(10); SArray b(a); // copying + * std::shared_ptr> c(new std::vector(10)); + * SArray d(c); // only pointer copying + * \endcode + * + * SArray is also like a C pointer when copying and assigning, namely + * both copy are assign are passing by pointers. The memory will be release only + * if there is no copy exists. It is also can be cast without memory copy, such + * as + * + * \code + * SArray a(10); + * SArray b(a); // now b.size() = 10 * sizeof(int); + * \endcode + * + * \tparam V the value type + */ +template +class SArray { +public: + /** \brief empty constructor */ + SArray() { + } + + /** \brief empty deconstrcutor */ + ~SArray() { + } + + /** + * \brief Create an array with length n with initialized value + * \param size the length + * \param val the initial length (0 in default) + */ + explicit SArray(size_t size, V val = 0) { + resize(size, val); + } + + /** + * \brief construct from another SArray. + * + * Zero-copy constructor, namely just copy the pointer + * + * \tparam W the value type of the source array + * \param arr the source array + */ + template + explicit SArray(const SArray &arr) { + *this = arr; + } + + /** + * \brief construct from another SArray. 
+ * + * Zero-copy constructor, namely just copy the pointer + * + * \tparam W the value type of the source array + * \param arr the source array + */ + template + void operator=(const SArray &arr) { + size_ = arr.size() * sizeof(W) / sizeof(V); + CHECK_EQ(size_ * sizeof(V), arr.size() * sizeof(W)) + << "cannot be divided"; + capacity_ = arr.capacity() * sizeof(W) / sizeof(V); + ptr_ = std::shared_ptr(arr.ptr(), reinterpret_cast(arr.data())); + } + + /** + * \brief construct from a c-array + * + * Zero-copy constructor, namely just copy the pointer + * + * \param data the source data + * \param size the length + * \param deletable whether or not can call `delete [] data` when the + * reference count goes 0 + */ + + SArray(V *data, size_t size, bool deletable = false) { + if (deletable) { + reset(data, size, [](V *data) { delete[] data; }); + } else { + reset(data, size, [](V *data) {}); + } + } + + /** + * \brief copy from a c-array + * + * \param data the source data + * \param size the length + */ + void CopyFrom(const V *data, size_t size) { + resize(size); + memcpy(this->data(), data, size * sizeof(V)); + } + + /** + * \brief copy from another SArray + * + * \param other the source data + */ + void CopyFrom(const SArray &other) { + if (this == &other) + return; + CopyFrom(other.data(), other.size()); + } + + /** + * \brief copy from an iterator + */ + template + void CopyFrom(const ForwardIt &first, const ForwardIt &last) { + size_t size = static_cast(std::distance(first, last)); + V *data = new V[size]; + reset(data, size, [](V *data) { delete[] data; }); + auto it = first; + while (size-- > 0) { + *data = *it; + ++data; + ++it; + } + } + + /** + * \brief construct from a std::vector, copy the data + */ + explicit SArray(const std::vector &vec) { + CopyFrom(vec.data(), vec.size()); + } + + /** + * \brief construct from a shared std::vector pinter, no data copy + */ + explicit SArray(const std::shared_ptr> &vec) { + ptr_ = std::shared_ptr(vec, vec->data()); + size_ = vec->size(); + capacity_ = size_; + } + + /** @brief Copy from a initializer_list */ + template + SArray(const std::initializer_list &list) { + CopyFrom(list.begin(), list.end()); + } + + /** @brief Copy from a initializer_list */ + template + void operator=(const std::initializer_list &list) { + CopyFrom(list.begin(), list.end()); + } + + /** + * @brief Reset the current data pointer with a deleter + */ + template + void reset(V *data, size_t size, Deleter del) { + size_ = size; + capacity_ = size; + ptr_.reset(data, del); + } + + /** + * @brief Resizes the array to size elements + * + * If size <= capacity_, then only change the size. otherwise, append size - + * current_size entries, and then set new value to val + */ + void resize(size_t size, V val = 0) { + size_t cur_n = size_; + if (capacity_ >= size) { + size_ = size; + } else { + V *new_data = new V[size + 5]; + memcpy(new_data, data(), size_ * sizeof(V)); + reset(new_data, size, [](V *data) { delete[] data; }); + } + if (size <= cur_n) + return; + V *p = data() + cur_n; + if (val == 0) { + memset(p, 0, (size - cur_n) * sizeof(V)); + } else { + for (size_t i = 0; i < size - cur_n; ++i) { + *p = val; + ++p; + } + } + } + + /** + * @brief Requests that the capacity be at least enough to contain n + * elements. 
+ */ + void reserve(size_t size) { + if (capacity_ >= size) { + return; + } + size_t old_size = size_; + resize(size); + size_ = old_size; + } + + /** @brief release the memory */ + void clear() { + reset(nullptr, 0, [](V *data) {}); + } + + inline bool empty() const { + return size() == 0; + } + inline size_t size() const { + return size_; + } + inline size_t capacity() const { + return capacity_; + } + + inline V *begin() { + return data(); + } + inline const V *begin() const { + return data(); + } + inline V *end() { + return data() + size(); + } + inline const V *end() const { + return data() + size(); + } + + inline V *data() const { + return ptr_.get(); + } + + /** \brief get the shared pointer */ + inline std::shared_ptr &ptr() { + return ptr_; + } + /** \brief get the const shared pointer */ + inline const std::shared_ptr &ptr() const { + return ptr_; + } + + inline V back() const { + CHECK(!empty()); + return data()[size_ - 1]; + } + inline V front() const { + CHECK(!empty()); + return data()[0]; + } + inline V &operator[](size_t i) { + return data()[i]; + } + inline const V &operator[](size_t i) const { + return data()[i]; + } + + inline void push_back(const V &val) { + if (size_ == capacity_) + reserve(size_ * 2 + 5); + data()[size_++] = val; + } + + void pop_back() { + if (size_) + --size_; + } + + void append(const SArray &arr) { + if (arr.empty()) + return; + auto orig_size = size_; + resize(size_ + arr.size()); + memcpy(data() + orig_size, arr.data(), arr.size() * sizeof(V)); + } + + /** + * @brief Slice a segment, zero-copy + * + * @param begin the start index segment + * @param end the end index segment + * @return the segment [begin, end) + */ + SArray segment(size_t begin, size_t end) const { + CHECK_GE(end, begin); + CHECK_LE(end, size()); + SArray ret; + ret.ptr_ = std::shared_ptr(ptr_, data() + begin); + ret.size_ = end - begin; + ret.capacity_ = end - begin; + return ret; + } + +private: + size_t size_ = 0; + size_t capacity_ = 0; + std::shared_ptr ptr_; +}; + +/** + * \brief print a debug string + */ +template +std::ostream &operator<<(std::ostream &os, const SArray &obj) { + os << DebugStr(obj.data(), obj.size()); + return os; +} diff --git a/ps-lite/include/common/shared_mutex.h b/ps-lite/include/common/shared_mutex.h new file mode 100644 index 0000000..d7ea395 --- /dev/null +++ b/ps-lite/include/common/shared_mutex.h @@ -0,0 +1,184 @@ +/* + * shared_mutex (C) 2017 E. Oriani, ema fastwebnet it + * + * This file is part of shared_mutex. + * + * shared_mutex is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * shared_mutex is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with nettop. If not, see . + */ + +#pragma once + +#include +#include +#include + +// this should be defined in the Makefile +// if not defined, use what is most common +// for x86_64 CPUs in 2017... 
+#ifndef LEVEL1_DCACHE_LINESIZE +#define LEVEL1_DCACHE_LINESIZE 64 +#endif + +namespace ps { +template +class shared_mutex { + // purpose of this structure is to hold + // status of each individual bucket-mutex + // object + // Ideally each thread should be mapped to + // one entry only of 'el_' during its + // lifetime + struct entry_lock { + const static uint64_t W_MASK = 0x8000000000000000, R_MASK = ~W_MASK; + + // purpose ot this variable is to hold + // in the first bit (W_MASK) if we're locking + // in exclusive mode, otherwise use the + // reamining 63 bits to count how many R/O + // locks we share in this very bucket + std::atomic wr_lock; + + entry_lock() : wr_lock(0) { + } + } __attribute__((aligned(LEVEL1_DCACHE_LINESIZE))); + // array holding all the buckets + std::array el_; + // atomic variable used to initialize thread + // ids so that they should evenly spread + // across all the buckets + static std::atomic idx_hint_; + // lock-free function to return a 'unique' id + static uint64_t get_hint_idx(void) { + while (true) { + size_t cur_hint = idx_hint_.load(); + if (idx_hint_.compare_exchange_weak(cur_hint, cur_hint + 1)) + return cur_hint; + } + } + // get index for given thread + // could hav used something like + // std::hash()(std::this_thread::get_id()) but honestly + // using a controlled idx_hint_ seems to be better in terms of putting + // threads into buckets evenly note - thread_local is supposed to be + // static... + inline static size_t get_thread_idx(void) { + const thread_local size_t rv = get_hint_idx() % N; + return rv; + } + +public: + shared_mutex() { + } + + void lock_shared(void) { + // try to replace the wr_lock with current value incremented by one + while (true) { + size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load(); + if (entry_lock::W_MASK & cur_rw_lock) { + // if someone has got W access yield and retry... 
+ std::this_thread::yield(); + continue; + } + if (el_[get_thread_idx()].wr_lock.compare_exchange_weak( + cur_rw_lock, cur_rw_lock + 1)) + break; + } + } + + void unlock_shared(void) { + // try to decrement the count + while (true) { + size_t cur_rw_lock = el_[get_thread_idx()].wr_lock.load(); +#ifndef _RELEASE + if (entry_lock::W_MASK & cur_rw_lock) + throw std::runtime_error( + "Fatal: unlock_shared but apparently this entry is W_MASK locked!"); +#endif //_RELEASE + if (el_[get_thread_idx()].wr_lock.compare_exchange_weak( + cur_rw_lock, cur_rw_lock - 1)) + break; + } + } + + void lock(void) { + for (size_t i = 0; i < N; ++i) { + // acquire all locks from all buckets + while (true) { + size_t cur_rw_lock = el_[i].wr_lock.load(); + if (cur_rw_lock != 0) { + std::this_thread::yield(); + continue; + } + // if cur_rw_lock is 0 then proceed + if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock, + entry_lock::W_MASK)) + break; + } + } + } + + void unlock(void) { + for (size_t i = 0; i < N; ++i) { + // release all locks + while (true) { + size_t cur_rw_lock = el_[i].wr_lock.load(); +#ifndef _RELEASE + if (cur_rw_lock != entry_lock::W_MASK) + throw std::runtime_error( + "Fatal: unlock but apparently this entry is shared locked or uninitialized!"); +#endif //_RELEASE + // then proceed resetting to 0 + if (el_[i].wr_lock.compare_exchange_weak(cur_rw_lock, 0)) + break; + } + } + } + + ~shared_mutex() { + } +}; + +template +std::atomic shared_mutex::idx_hint_{0}; + +// utility class for exclusive RAII lock +template +class x_lock { + shared_mutex &sm_; + +public: + x_lock(shared_mutex &sm) : sm_(sm) { + sm_.lock(); + } + + ~x_lock() { + sm_.unlock(); + } +}; + +// utility class for share RAII lock +template +class s_lock { + shared_mutex &sm_; + +public: + s_lock(shared_mutex &sm) : sm_(sm) { + sm_.lock_shared(); + } + + ~s_lock() { + sm_.unlock_shared(); + } +}; +} // namespace ps diff --git a/ps-lite/include/common/thread_pool.h b/ps-lite/include/common/thread_pool.h new file mode 100644 index 0000000..1899b0a --- /dev/null +++ b/ps-lite/include/common/thread_pool.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { +public: + ThreadPool(size_t thread_num); + ~ThreadPool(); + static ThreadPool *Get(); + + template + auto Enqueue(F &&f, Args &&... 
args) + -> std::future::type> { + using return_type = typename std::result_of::type; + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + std::future res = task->get_future(); + { + std::unique_lock lock(mutex_); + if (terminate_) + throw std::runtime_error("enqueue on stopped ThreadPool"); + tasks_.emplace([task]() { (*task)(); }); + } + cond_.notify_one(); + return res; + } + + void Wait(int task_num); + + size_t ThreadNum() { + return thread_num_; + } + +private: + bool terminate_; + size_t thread_num_; + std::atomic_int complete_task_num_; + std::vector threads_; + std::queue> tasks_; + std::mutex mutex_; + std::condition_variable cond_; +}; diff --git a/ps-lite/include/common/thread_safe_hash_map.h b/ps-lite/include/common/thread_safe_hash_map.h new file mode 100644 index 0000000..ce1f1e7 --- /dev/null +++ b/ps-lite/include/common/thread_safe_hash_map.h @@ -0,0 +1,325 @@ +#pragma once +#include +#include +#include +#include "shared_mutex.h" + +namespace ps { +/* + thread_safe unordered_map + use read-write lock to guaruntee concurrency +*/ +template , + typename _Pred = std::equal_to<_Key>, + typename _Alloc = std::allocator>> +class threadsafe_unordered_map { +private: + std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc> map; + mutable shared_mutex<4> mtx; + +public: + using map_type = std::unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>; + using key_type = typename map_type::key_type; + using mapped_type = typename map_type::mapped_type; + using value_type = typename map_type::value_type; + using hasher = typename map_type::hasher; + using key_equal = typename map_type::key_equal; + using allocator_type = typename map_type::allocator_type; + using reference = typename map_type::reference; + using const_reference = typename map_type::const_reference; + using pointer = typename map_type::pointer; + using const_pointer = typename map_type::const_pointer; + using iterator = typename map_type::iterator; + using const_iterator = typename map_type::const_iterator; + using local_iterator = typename map_type::local_iterator; + using const_local_iterator = typename map_type::const_local_iterator; + using size_type = typename map_type::size_type; + using difference_type = typename map_type::difference_type; + + threadsafe_unordered_map() = default; + threadsafe_unordered_map(const threadsafe_unordered_map &) = delete; + threadsafe_unordered_map(threadsafe_unordered_map &&) = default; + threadsafe_unordered_map & + operator=(const threadsafe_unordered_map &) = delete; + threadsafe_unordered_map &operator=(threadsafe_unordered_map &&) = delete; + explicit threadsafe_unordered_map( + size_type __n, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__n, __hf, __eql, __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n = 0, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__first, __last, __n, __hf, __eql, __a) { + } + threadsafe_unordered_map(const map_type &v) : map(v) { + } + threadsafe_unordered_map(map_type &&rv) : map(std::move(rv)) { + } + explicit threadsafe_unordered_map(const allocator_type &__a) : map(__a) { + } + threadsafe_unordered_map(const map_type &__umap, + const allocator_type &__a) : + map(__umap, __a) { + } + threadsafe_unordered_map(map_type &&__umap, const allocator_type &__a) : + map(std::move(__umap), __a) { + } + 
threadsafe_unordered_map(std::initializer_list __l, + size_type __n = 0, const hasher &__hf = hasher(), + const key_equal &__eql = key_equal(), + const allocator_type &__a = allocator_type()) : + map(__l, __n, __hf, __eql, __a) { + } + threadsafe_unordered_map(size_type __n, const allocator_type &__a) : + threadsafe_unordered_map(__n, hasher(), key_equal(), __a) { + } + threadsafe_unordered_map(size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__n, __hf, key_equal(), __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n, const allocator_type &__a) : + map(__first, __last, __n, __a) { + } + template + threadsafe_unordered_map(_InputIterator __first, _InputIterator __last, + size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__first, __last, __n, __hf, key_equal(), __a) { + } + threadsafe_unordered_map(std::initializer_list __l, + size_type __n, const allocator_type &__a) : + threadsafe_unordered_map(__l, __n, hasher(), key_equal(), __a) { + } + threadsafe_unordered_map(std::initializer_list __l, + size_type __n, const hasher &__hf, + const allocator_type &__a) : + threadsafe_unordered_map(__l, __n, __hf, key_equal(), __a) { + } + bool empty() const noexcept { + s_lock<4> read_lock(mtx); + return map.empty(); + } + size_type size() const noexcept { + s_lock<4> read_lock(mtx); + return map.size(); + } + size_type max_size() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_size(); + } + iterator begin() noexcept { + x_lock<4> write_lock(mtx); + return map.begin(); + } + const_iterator begin() const noexcept { + s_lock<4> read_lock(mtx); + return map.begin(); + } + const_iterator cbegin() const noexcept { + s_lock<4> read_lock(mtx); + return map.cbegin(); + } + iterator end() noexcept { + x_lock<4> write_lock(mtx); + return map.end(); + } + const_iterator end() const noexcept { + s_lock<4> read_lock(mtx); + return map.end(); + } + const_iterator cend() const noexcept { + s_lock<4> read_lock(mtx); + return map.cend(); + } + template + std::pair emplace(_Args &&... __args) { + x_lock<4> write_lock(mtx); + return map.emplace(std::forward<_Args>(__args)...); + } + template + iterator emplace_hint(const_iterator __pos, _Args &&... 
__args) { + x_lock<4> write_lock(mtx); + return map.emplace_hint(__pos, std::forward<_Args>(__args)...); + } + std::pair insert(const value_type &__x) { + x_lock<4> write_lock(mtx); + return map.insert(__x); + } + template ::value>::type> + std::pair insert(_Pair &&__x) { + x_lock<4> write_lock(mtx); + return map.insert(std::forward<_Pair>(__x)); + } + iterator insert(const_iterator __hint, const value_type &__x) { + x_lock<4> write_lock(mtx); + return map.insert(__hint, __x); + } + template ::value>::type> + iterator insert(const_iterator __hint, _Pair &&__x) { + x_lock<4> write_lock(mtx); + return map.insert(__hint, std::forward<_Pair>(__x)); + } + template + void insert(_InputIterator __first, _InputIterator __last) { + x_lock<4> write_lock(mtx); + map.insert(__first, __last); + } + void insert(std::initializer_list __l) { + x_lock<4> write_lock(mtx); + map.insert(__l); + } + iterator erase(const_iterator __position) { + x_lock<4> write_lock(mtx); + return map.erase(__position); + } + iterator erase(iterator __position) { + x_lock<4> write_lock(mtx); + return map.erase(__position); + } + size_type erase(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.erase(__x); + } + iterator erase(const_iterator __first, const_iterator __last) { + x_lock<4> write_lock(mtx); + return map.erase(__first, __last); + } + void clear() noexcept { + x_lock<4> write_lock(mtx); + map.clear(); + } + void swap(map_type &__x) noexcept(noexcept(map.swap(__x._M_h))) { + x_lock<4> write_lock(mtx); + map.swap(__x._M_h); + } + hasher hash_function() const { + s_lock<4> read_lock(mtx); + return map.hash_function(); + } + key_equal key_eq() const { + s_lock<4> read_lock(mtx); + return map.key_eq(); + } + iterator find(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.find(__x); + } + const_iterator find(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.find(__x); + } + size_type count(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.count(__x); + } + std::pair equal_range(const key_type &__x) { + x_lock<4> write_lock(mtx); + return map.equal_range(__x); + } + std::pair + equal_range(const key_type &__x) const { + s_lock<4> read_lock(mtx); + return map.equal_range(__x); + } + mapped_type &operator[](const key_type &__k) { + x_lock<4> write_lock(mtx); + return map[__k]; + } + mapped_type &operator[](key_type &&__k) { + x_lock<4> write_lock(mtx); + return map[std::move(__k)]; + } + mapped_type &at(const key_type &__k) { + x_lock<4> write_lock(mtx); + return map.at(__k); + } + const mapped_type &at(const key_type &__k) const { + s_lock<4> read_lock(mtx); + return map.at(__k); + } + size_type bucket_count() const noexcept { + s_lock<4> read_lock(mtx); + return map.bucket_count(); + } + + size_type max_bucket_count() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_bucket_count(); + } + size_type bucket_size(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.bucket_size(__n); + } + size_type bucket(const key_type &__key) const { + s_lock<4> read_lock(mtx); + return map.bucket(__key); + } + local_iterator begin(size_type __n) { + x_lock<4> write_lock(mtx); + return map.begin(__n); + } + const_local_iterator begin(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.begin(__n); + } + const_local_iterator cbegin(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.cbegin(__n); + } + local_iterator end(size_type __n) { + x_lock<4> write_lock(mtx); + return map.end(__n); + } + const_local_iterator end(size_type 
__n) const { + s_lock<4> read_lock(mtx); + return map.end(__n); + } + const_local_iterator cend(size_type __n) const { + s_lock<4> read_lock(mtx); + return map.cend(__n); + } + float load_factor() const noexcept { + s_lock<4> read_lock(mtx); + return map.load_factor(); + } + float max_load_factor() const noexcept { + s_lock<4> read_lock(mtx); + return map.max_load_factor(); + } + void max_load_factor(float __z) { + x_lock<4> write_lock(mtx); + map.max_load_factor(__z); + } + void rehash(size_type __n) { + x_lock<4> write_lock(mtx); + map.rehash(__n); + } + void reserve(size_type __n) { + x_lock<4> write_lock(mtx); + map.reserve(__n); + } + // ----------------------------- Added function + // ---------------------------------- + template + const_iterator emplaceIfAbsent(const key_type &__x, _Args &&... __args) { + x_lock<4> write_lock(mtx); + iterator iter = map.find(__x); + if (iter == map.end()) { + iter = map.emplace(__x, mapped_type(std::forward<_Args>(__args)...)) + .first; + } + return iter; + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/base.h b/ps-lite/include/ps/base.h new file mode 100644 index 0000000..0984d05 --- /dev/null +++ b/ps-lite/include/ps/base.h @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_BASE_H_ +#define PS_BASE_H_ +#include +#include "ps/internal/utils.h" +namespace ps { + +#if USE_KEY32 +/*! \brief Use unsigned 32-bit int as the key type */ +using Key = uint32_t; +#else +/*! \brief Use unsigned 64-bit int as the key type */ +using Key = uint64_t; +#endif +/*! \brief The maximal allowed key value */ +static const Key kMaxKey = std::numeric_limits::max(); +/** \brief node ID for the scheduler */ +static const int kScheduler = 1; +/** + * \brief the server node group ID + * + * group id can be combined: + * - kServerGroup + kScheduler means all server nodes and the scheuduler + * - kServerGroup + kWorkerGroup means all server and worker nodes + */ +static const int kServerGroup = 2; +/** \brief the worker node group ID */ +static const int kWorkerGroup = 4; + +} // namespace ps +#endif // PS_BASE_H_ diff --git a/ps-lite/include/ps/internal/assign_op.h b/ps-lite/include/ps/internal/assign_op.h new file mode 100644 index 0000000..7941960 --- /dev/null +++ b/ps-lite/include/ps/internal/assign_op.h @@ -0,0 +1,84 @@ +/** + * Copyright (c) 2015 by Contributors + * \file assign_op.h + * \brief assignment operator + * http://en.cppreference.com/w/cpp/language/operator_assignment + */ +#ifndef PS_INTERNAL_ASSIGN_OP_H_ +#define PS_INTERNAL_ASSIGN_OP_H_ +#include "ps/internal/utils.h" +namespace ps { + +enum AssignOp { + ASSIGN, // a = b + PLUS, // a += b + MINUS, // a -= b + TIMES, // a *= b + DIVIDE, // a -= b + AND, // a &= b + OR, // a |= b + XOR // a ^= b +}; + +/** + * \brief return an assignment function: right op= left + */ +template +inline void AssignFunc(const T &lhs, AssignOp op, T *rhs) { + switch (op) { + case ASSIGN: + *right = left; + break; + case PLUS: + *right += left; + break; + case MINUS: + *right -= left; + break; + case TIMES: + *right *= left; + break; + case DIVIDE: + *right /= left; + break; + default: + LOG(FATAL) << "use AssignOpInt.."; + } +} + +/** + * \brief return an assignment function including bit operations, only + * works for integers + */ +template +inline void AssignFuncInt(const T &lhs, AssignOp op, T *rhs) { + switch (op) { + case ASSIGN: + *right = left; + break; + case PLUS: + *right += left; + break; + case MINUS: + *right -= left; + break; + case TIMES: + *right *= left; + break; + case 
DIVIDE: + *right /= left; + break; + case AND: + *right &= left; + break; + case OR: + *right |= left; + break; + case XOR: + *right ^= left; + break; + } +} + +} // namespace ps +#endif // PS_INTERNAL_ASSIGN_OP_H_ diff --git a/ps-lite/include/ps/internal/customer.h b/ps-lite/include/ps/internal/customer.h new file mode 100644 index 0000000..0a7ee11 --- /dev/null +++ b/ps-lite/include/ps/internal/customer.h @@ -0,0 +1,119 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_CUSTOMER_H_ +#define PS_INTERNAL_CUSTOMER_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/internal/message.h" +#include "ps/internal/threadsafe_pqueue.h" +namespace ps { + +/** + * \brief The object for communication. + * + * As a sender, a customer tracks the responses for each request sent. + * + * It has its own receiving thread which is able to process any message received + * from a remote node with `msg.meta.customer_id` equal to this customer's id + */ +class Customer { +public: + /** + * \brief the handle for a received message + * \param recved the received message + */ + using RecvHandle = std::function; + + /** + * \brief constructor + * \param app_id the globally unique id indicating the application the + * postoffice serving for \param customer_id the locally unique id + * indicating the customer of a postoffice \param recv_handle the functino + * for processing a received message + */ + Customer(int app_id, int customer_id, const RecvHandle &recv_handle); + + /** + * \brief desconstructor + */ + ~Customer(); + + /** + * \brief return the globally unique application id + */ + inline int app_id() { + return app_id_; + } + + /** + * \brief return the locally unique customer id + */ + inline int customer_id() { + return customer_id_; + } + + /** + * \brief get a timestamp for a new request. threadsafe + * \param recver the receive node id of this request + * \return the timestamp of this request + */ + int NewRequest(int recver); + + /** + * \brief wait until the request is finished. threadsafe + * \param timestamp the timestamp of the request + */ + void WaitRequest(int timestamp); + + /** + * \brief return the number of responses received for the request. + * threadsafe \param timestamp the timestamp of the request + */ + // int NumResponse(int timestamp); + + /** + * \brief add a number of responses to timestamp + */ + // void AddResponse(int timestamp, int num = 1); + + /** + * \brief accept a received message from \ref Van. 
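assign_op.h above selects a compound assignment (a = b, a += b, a -= b, ...) from an AssignOp value, which is how a server decides whether an incoming value overwrites or accumulates into a stored one. A minimal standalone sketch of the same dispatch idea follows; DemoAssignOp and DemoAssign are illustrative names rather than the header's API (the header declares AssignFunc(lhs, op, rhs)).

#include <cassert>

// Illustrative re-implementation of the op-dispatch idea from assign_op.h.
enum DemoAssignOp { DEMO_ASSIGN, DEMO_PLUS, DEMO_MINUS, DEMO_TIMES, DEMO_DIVIDE };

template <typename T>
inline void DemoAssign(const T &lhs, DemoAssignOp op, T *rhs) {
    switch (op) {
    case DEMO_ASSIGN: *rhs = lhs; break;   // a = b
    case DEMO_PLUS:   *rhs += lhs; break;  // a += b
    case DEMO_MINUS:  *rhs -= lhs; break;  // a -= b
    case DEMO_TIMES:  *rhs *= lhs; break;  // a *= b
    case DEMO_DIVIDE: *rhs /= lhs; break;  // a /= b
    }
}

int main() {
    float acc = 1.0f;
    DemoAssign(2.5f, DEMO_PLUS, &acc);  // acc == 3.5
    DemoAssign(2.0f, DEMO_TIMES, &acc); // acc == 7.0
    assert(acc == 7.0f);
    return 0;
}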
threadsafe + * \param recved the received the message + */ + inline void Accept(const Message &recved) { + recv_queue_.Push(recved); + } + +private: + /** + * \brief the thread function + */ + void Receiving(); + + int app_id_; + + int customer_id_; + + RecvHandle recv_handle_; + ThreadsafePQueue recv_queue_; + // using multithread to speed data processing + std::vector> recv_threads_; + + std::mutex tracker_mu_; + std::condition_variable tracker_cond_; + std::unordered_map tracker_; + int cur_timestamp; + + DISALLOW_COPY_AND_ASSIGN(Customer); +}; + +} // namespace ps +#endif // PS_INTERNAL_CUSTOMER_H_ diff --git a/ps-lite/include/ps/internal/env.h b/ps-lite/include/ps/internal/env.h new file mode 100644 index 0000000..d3e4aa0 --- /dev/null +++ b/ps-lite/include/ps/internal/env.h @@ -0,0 +1,69 @@ +/** + * Copyright (c) 2016 by Contributors + */ +#ifndef PS_INTERNAL_ENV_H_ +#define PS_INTERNAL_ENV_H_ +#include +#include +#include +#include +namespace ps { + +/** + * \brief Environment configurations + */ +class Environment { +public: + /** + * \brief return the singleton instance + */ + static inline Environment *Get() { + return _GetSharedRef(nullptr).get(); + } + /** + * \brief return a shared ptr of the singleton instance + */ + static inline std::shared_ptr _GetSharedRef() { + return _GetSharedRef(nullptr); + } + /** + * \brief initialize the environment + * \param envs key-value environment variables + * \return the initialized singleton instance + */ + static inline Environment * + Init(const std::unordered_map &envs) { + Environment *env = _GetSharedRef(&envs).get(); + env->kvs = envs; + return env; + } + + /** + * \brief find the env value. + * User-defined env vars first. If not found, check system's environment + * \param k the environment key + * \return the related environment value, nullptr when not found + */ + const char *find(const char *k) { + std::string key(k); + return kvs.find(key) == kvs.end() ? getenv(k) : kvs[key].c_str(); + } + +private: + explicit Environment( + const std::unordered_map *envs) { + if (envs) + kvs = *envs; + } + + static std::shared_ptr + _GetSharedRef(const std::unordered_map *envs) { + static std::shared_ptr inst_ptr(new Environment(envs)); + return inst_ptr; + } + + std::unordered_map kvs; +}; + +} // namespace ps +#endif // PS_INTERNAL_ENV_H_ diff --git a/ps-lite/include/ps/internal/message.h b/ps-lite/include/ps/internal/message.h new file mode 100644 index 0000000..ceb8133 --- /dev/null +++ b/ps-lite/include/ps/internal/message.h @@ -0,0 +1,170 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_MESSAGE_H_ +#define PS_INTERNAL_MESSAGE_H_ +#include "common/sarray.h" +#include "ps/psf/PSFunc.h" +#include +#include +#include +#include + +namespace ps { + +/** + * \brief information about a node + */ +struct Node { + /** \brief the empty value */ + static const int kEmpty; + /** \brief default constructor */ + Node() : id(kEmpty), port(kEmpty), is_recovery(false) { + } + /** \brief node roles */ + enum Role { SERVER, WORKER, SCHEDULER }; + /** \brief get debug string */ + std::string DebugString() const { + std::stringstream ss; + ss << "role=" + << (role == SERVER ? "server" : + (role == WORKER ? "worker" : "scheduler")) + << (id != kEmpty ? ", id=" + std::to_string(id) : "") + << ", ip=" << hostname << ", port=" << port + << ", is_recovery=" << is_recovery; + + return ss.str(); + } + /** \brief get short debug string */ + std::string ShortDebugString() const { + std::string str = role == SERVER ? "S" : (role == WORKER ? 
"W" : "H"); + if (id != kEmpty) + str += "[" + std::to_string(id) + "]"; + return str; + } + /** \brief the role of this node */ + Role role; + /** \brief node id */ + int id; + /** \brief customer id */ + int customer_id; + /** \brief hostname or ip */ + std::string hostname; + /** \brief the port this node is binding */ + int port; + /** \brief whether this node is created by failover */ + bool is_recovery; +}; +/** + * \brief meta info of a system control message + */ +struct Control { + /** \brief empty constructor */ + Control() : cmd(EMPTY) { + } + /** \brief return true is empty */ + inline bool empty() const { + return cmd == EMPTY; + } + /** \brief get debug string */ + std::string DebugString() const { + if (empty()) + return ""; + std::vector cmds = {"EMPTY", "TERMINATE", "ADD_NODE", + "BARRIER", "ACK", "HEARTBEAT"}; + std::stringstream ss; + ss << "cmd=" << cmds[cmd]; + if (node.size()) { + ss << ", node={"; + for (const Node &n : node) + ss << " " << n.DebugString(); + ss << " }"; + } + if (cmd == BARRIER) + ss << ", barrier_group=" << barrier_group; + if (cmd == ACK) + ss << ", msg_sig=" << msg_sig; + return ss.str(); + } + /** \brief all commands */ + enum Command { EMPTY, TERMINATE, ADD_NODE, BARRIER, ACK, HEARTBEAT }; + /** \brief the command */ + Command cmd; + /** \brief node infos */ + std::vector node; + /** \brief the node group for a barrier, such as kWorkerGroup */ + int barrier_group; + /** message signature */ + uint64_t msg_sig; +}; +/** + * \brief meta info of a message + */ +struct Meta { + /** \brief the empty value */ + static const int kEmpty; + /** \brief default constructor */ + Meta() : + app_id(kEmpty), customer_id(kEmpty), timestamp(kEmpty), sender(kEmpty), + recver(kEmpty), request(false), priority(kEmpty), + psftype(PsfType::DensePull) { + } + std::string DebugString() const { + std::stringstream ss; + if (sender == Node::kEmpty) { + ss << "?"; + } else { + ss << sender; + } + ss << " => " << recver; + if (timestamp != kEmpty) + ss << ", timestamp=" << timestamp; + if (!control.empty()) { + ss << ", control={ " << control.DebugString() << " }"; + } else { + ss << ", app_id=" << app_id << ", customer_id=" << customer_id + << ", priority=" << priority << ", psfType=" << psftype; + } + return ss.str(); + } + /** \brief the unique id of the application of messsage is for*/ + int app_id; + /** \brief customer id*/ + int customer_id; + /** \brief the timestamp of this message */ + int timestamp; + /** \brief the node id of the sender of this message */ + int sender; + /** \brief the node id of the receiver of this message */ + int recver; + /** \brief whether or not this is a request message*/ + bool request; + /** \brief system control message */ + Control control; + /** \brief message priority */ + int priority; + /** \brief server-side computation op for keys */ + PsfType psftype; +}; +/** + * \brief messages that communicated amaong nodes. 
+ */ +struct Message { + /** \brief the meta info of this message */ + Meta meta; + /** \brief the large chunk of data of this message */ + std::vector> data; + + std::string DebugString() const { + std::stringstream ss; + ss << meta.DebugString(); + if (data.size()) { + ss << " Body:"; + for (const auto &d : data) + ss << " data_size=" << d.size(); + } + return ss.str(); + } +}; +} // namespace ps +#endif // PS_INTERNAL_MESSAGE_H_ diff --git a/ps-lite/include/ps/internal/parallel_kv_match.h b/ps-lite/include/ps/internal/parallel_kv_match.h new file mode 100644 index 0000000..6a9bf18 --- /dev/null +++ b/ps-lite/include/ps/internal/parallel_kv_match.h @@ -0,0 +1,125 @@ +/** + * Copyright (c) 2015 by Contributors + * \file parallel_kv_match.h + * \brief paralle key-value pairs matching + */ +#ifndef PS_INTERNAL_PARALLEL_KV_MATCH_H_ +#define PS_INTERNAL_PARALLEL_KV_MATCH_H_ +#include +#include +#include "ps/sarray.h" +#include "ps/internal/assign_op.h" + +namespace ps { +namespace { +/** + * \brief thread function, internal use + * + * \param src_key start of source key + * \param src_key_end end of source key + * \param src_val start of source val + * \param dst_key start of destination key + * \param dst_key_end end of denstination key + * \param dst_val start of destination val + * \param k length of a single value + * \param op assignment operator + * \param grainsize thread grainsize size + * \param n number of matched kv pairs + */ +template +void ParallelOrderedMatch(const K *src_key, const K *src_key_end, + const V *src_val, const K *dst_key, + const K *dst_key_end, V *dst_val, int k, AsOp op, + size_t grainsize, size_t *n) { + size_t src_len = std::distance(src_key, src_key_end); + size_t dst_len = std::distance(dst_key, dst_key_end); + if (dst_len == 0 || src_len == 0) + return; + + // drop the unmatched tail of src + src_key = std::lower_bound(src_key, src_key_end, *dst_key); + src_val += (src_key - (src_key_end - src_len)) * k; + + if (dst_len <= grainsize) { + while (dst_key != dst_key_end && src_key != src_key_end) { + if (*src_key < *dst_key) { + ++src_key; + src_val += k; + } else { + if (!(*dst_key < *src_key)) { + for (int i = 0; i < k; ++i) { + AssignOp(dst_val[i], src_val[i], op); + } + ++src_key; + src_val += k; + *n += k; + } + ++dst_key; + dst_val += k; + } + } + } else { + std::thread thr(ParallelOrderedMatch, src_key, src_key_end, + src_val, dst_key, dst_key + dst_len / 2, dst_val, k, op, + grainsize, n); + size_t m = 0; + ParallelOrderedMatch( + src_key, src_key_end, src_val, dst_key + dst_len / 2, dst_key_end, + dst_val + (dst_len / 2) * k, k, op, grainsize, &m); + thr.join(); + *n += m; + } +} +} // namespace + +/** + * \brief Merge \a src_val into \a dst_val by matching keys. Keys must be unique + * and sorted. + * + * \code + * if (dst_key[i] == src_key[j]) { + * dst_val[i] op= src_val[j] + * } + * \endcode + * + * When finished, \a dst_val will have length `k * dst_key.size()` and filled + * with matched value. Umatched value will be untouched if exists or filled with + * 0. + * + * \tparam K type of key + * \tparam V type of value + * \tparam C type of the container such as \ref SArray or \ref std::vector + * \param src_key the source keys + * \param src_val the source values + * \param dst_key the destination keys + * \param dst_val the destination values. 
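The \code block above is the whole contract of ParallelOrderedMatch: both key arrays are sorted and duplicate-free, and a destination slot is filled only when its key also appears in the source. A single-threaded sketch of that merge, using std::vector in place of SArray and fixing op to ASSIGN (OrderedMatchAssign is an illustrative name, not the library's):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified ordered match: for every dst key that also appears in src,
// copy the k values attached to that key; unmatched slots stay zero.
// Both key arrays must be sorted and contain no duplicates.
size_t OrderedMatchAssign(const std::vector<uint64_t> &src_key,
                          const std::vector<float> &src_val,
                          const std::vector<uint64_t> &dst_key,
                          std::vector<float> *dst_val, int k) {
    dst_val->assign(dst_key.size() * k, 0.0f);
    size_t i = 0, j = 0, matched = 0;
    while (i < src_key.size() && j < dst_key.size()) {
        if (src_key[i] < dst_key[j]) {
            ++i;
        } else if (dst_key[j] < src_key[i]) {
            ++j;
        } else { // equal keys: copy the k-long value slice
            for (int c = 0; c < k; ++c)
                (*dst_val)[j * k + c] = src_val[i * k + c];
            matched += k;
            ++i;
            ++j;
        }
    }
    return matched; // number of matched value entries, as in the header
}

int main() {
    std::vector<uint64_t> src_key = {1, 3, 5}, dst_key = {3, 4, 5};
    std::vector<float> src_val = {10, 30, 50}, dst_val;
    size_t n = OrderedMatchAssign(src_key, src_val, dst_key, &dst_val, 1);
    assert(n == 2 && dst_val[0] == 30 && dst_val[1] == 0 && dst_val[2] == 50);
    return 0;
}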
+ * \param k the length of a single value (default is 1) + * \param op the assignment operator (default is ASSIGN) + * \param num_threads number of thread (default is 1) + * \return the number of matched kv pairs + */ +template +size_t ParallelOrderedMatch(const SArray &src_key, const SArray &src_val, + const SArray &dst_key, C *dst_val, int k = 1, + AssignOp op = ASSIGN, int num_threads = 1) { + // do check + CHECK_GT(num_threads, 0); + CHECK_EQ(src_key.size() * k, src_val.size()); + CHECK_NOTNULL(dst_val->resize(dst_key.size() * k)); + if (dst_key.empty()) + return 0; + + // shorten the matching range + Range range = FindRange(dst_key, src_key.begin(), src_key.end()); + size_t grainsize = std::max(range.size() * k / num_threads + 5, + static_cast(1024 * 1024)); + size_t n = 0; + ParallelOrderedMatch( + src_key.begin(), src_key.end(), src_val.begin(), + dst_key.begin() + range.begin(), dst_key.begin() + range.end(), + dst_val->begin() + range.begin() * k, k, op, grainsize, &n); + return n; +} + +} // namespace ps +#endif // PS_INTERNAL_PARALLEL_KV_MATCH_H_ diff --git a/ps-lite/include/ps/internal/parallel_sort.h b/ps-lite/include/ps/internal/parallel_sort.h new file mode 100644 index 0000000..174b0be --- /dev/null +++ b/ps-lite/include/ps/internal/parallel_sort.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) 2015 by Contributors + * @file parallel_sort.h + * @brief Parallel sort + */ +#ifndef PS_INTERNAL_PARALLEL_SORT_H_ +#define PS_INTERNAL_PARALLEL_SORT_H_ +#include +#include +#include +#include "ps/sarray.h" +namespace ps { + +namespace { +/** + * \brief the thread function + * + * \param data start pointer of data + * \param len length of data + * \param grainsize max data length of one thread + * \param cmp comparison function + */ +template +void ParallelSort(T *data, size_t len, size_t grainsize, const Fn &cmp) { + if (len <= grainsize) { + std::sort(data, data + len, cmp); + } else { + std::thread thr(ParallelSort, data, len / 2, grainsize, cmp); + ParallelSort(data + len / 2, len - len / 2, grainsize, cmp); + thr.join(); + + std::inplace_merge(data, data + len / 2, data + len, cmp); + } +} +} // namespace + +/** + * \brief Parallel Sort + * + * \param arr the array for sorting + * \param num_threads number of thread + * \param cmp the comparision function such as + * [](const T& a, const T& b) {* return a < b; } + * or an even simplier version: + * std::less() + */ +template +void ParallelSort(SArray *arr, int num_threads = 2, + const Fn &cmp = std::less()) { + CHECK_GT(num_threads, 0); + CHECK(cmp); + size_t grainsize = + std::max(arr->size() / num_threads + 5, (size_t)1024 * 16); + ParallelSort(arr->data(), arr->size(), grainsize, cmp); +} + +} // namespace ps +#endif // PS_INTERNAL_PARALLEL_SORT_H_ diff --git a/ps-lite/include/ps/internal/postoffice.h b/ps-lite/include/ps/internal/postoffice.h new file mode 100644 index 0000000..674b01e --- /dev/null +++ b/ps-lite/include/ps/internal/postoffice.h @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_POSTOFFICE_H_ +#define PS_INTERNAL_POSTOFFICE_H_ +#include +#include +#include +#include +#include +#include "ps/range.h" +#include "ps/internal/env.h" +#include "ps/internal/customer.h" +#include "ps/internal/van.h" +namespace ps { +/** + * \brief the center of the system + */ +class Postoffice { +public: + /** + * \brief return the singleton object + */ + static Postoffice *Get() { + static Postoffice e; + return &e; + } + /** \brief get the van */ + Van *van() { + return van_; + } + /** + * \brief 
start the system + * + * This function will block until every nodes are started. + * \param argv0 the program name, used for logging. + * \param do_barrier whether to block until every nodes are started. + */ + void Start(int customer_id, const char *argv0, const bool do_barrier); + /** + * \brief terminate the system + * + * All nodes should call this function before existing. + * \param do_barrier whether to do block until every node is finalized, + * default true. + */ + void Finalize(const int customer_id, const bool do_barrier = true); + /** + * \brief add an customer to the system. threadsafe + */ + void AddCustomer(Customer *customer); + /** + * \brief remove a customer by given it's id. threasafe + */ + void RemoveCustomer(Customer *customer); + /** + * \brief get the customer by id, threadsafe + * \param app_id the application id + * \param customer_id the customer id + * \param timeout timeout in sec + * \return return nullptr if doesn't exist and timeout + */ + Customer *GetCustomer(int app_id, int customer_id, int timeout = 0) const; + /** + * \brief get the id of a node (group), threadsafe + * + * if it is a node group, return the list of node ids in this + * group. otherwise, return {node_id} + */ + const std::vector &GetNodeIDs(int node_id) const { + const auto it = node_ids_.find(node_id); + CHECK(it != node_ids_.cend()) << "node " << node_id << " doesn't exist"; + return it->second; + } + /** + * \brief return the key ranges of all server nodes + */ + const std::vector &GetServerKeyRanges(); + /** + * \brief the template of a callback + */ + using Callback = std::function; + /** + * \brief Register a callback to the system which is called after Finalize() + * + * The following codes are equal + * \code {cpp} + * RegisterExitCallback(cb); + * Finalize(); + * \endcode + * + * \code {cpp} + * Finalize(); + * cb(); + * \endcode + * \param cb the callback function + */ + void RegisterExitCallback(const Callback &cb) { + exit_callback_ = cb; + } + /** + * \brief convert from a worker rank into a node id + * \param rank the worker rank + */ + static inline int WorkerRankToID(int rank) { + return rank * 2 + 9; + } + /** + * \brief convert from a server rank into a node id + * \param rank the server rank + */ + static inline int ServerRankToID(int rank) { + return rank * 2 + 8; + } + /** + * \brief convert from a node id into a server or worker rank + * \param id the node id + */ + static inline int IDtoRank(int id) { +#ifdef _MSC_VER +#undef max +#endif + return std::max((id - 8) / 2, 0); + } + /** \brief Returns the number of worker nodes */ + int num_workers() const { + return num_workers_; + } + /** \brief Returns the number of server nodes */ + int num_servers() const { + return num_servers_; + } + /** \brief Returns the rank of this node in its group + * + * Each worker will have a unique rank within [0, NumWorkers()). So are + * servers. This function is available only after \ref Start has been + * called. + */ + int my_rank() const { + return IDtoRank(van_->my_node().id); + } + /** \brief Returns true if this node is a worker node */ + int is_worker() const { + return is_worker_; + } + /** \brief Returns true if this node is a server node. */ + int is_server() const { + return is_server_; + } + /** \brief Returns true if this node is a scheduler node. */ + int is_scheduler() const { + return is_scheduler_; + } + /** \brief Returns the verbose level. 
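The rank/ID helpers above encode the node type in the parity of the ID: servers get even IDs starting at 8, workers get odd IDs starting at 9, and IDtoRank inverts both mappings. A small self-contained check of that arithmetic, reimplemented locally so it runs without the rest of the library:

#include <algorithm>
#include <cassert>

// Same arithmetic as Postoffice::WorkerRankToID / ServerRankToID / IDtoRank.
inline int WorkerRankToID(int rank) { return rank * 2 + 9; }
inline int ServerRankToID(int rank) { return rank * 2 + 8; }
inline int IDtoRank(int id) { return std::max((id - 8) / 2, 0); }

int main() {
    for (int rank = 0; rank < 4; ++rank) {
        assert(IDtoRank(WorkerRankToID(rank)) == rank); // worker IDs: 9, 11, 13, ...
        assert(IDtoRank(ServerRankToID(rank)) == rank); // server IDs: 8, 10, 12, ...
        assert(ServerRankToID(rank) % 2 == 0);          // servers are even
        assert(WorkerRankToID(rank) % 2 == 1);          // workers are odd
    }
    return 0;
}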
*/ + int verbose() const { + return verbose_; + } + /** \brief Return whether this node is a recovery node */ + bool is_recovery() const { + return van_->my_node().is_recovery; + } + /** + * \brief barrier + * \param node_id the barrier group id + */ + void Barrier(int customer_id, int node_group); + /** + * \brief process a control message, called by van + * \param the received message + */ + void Manage(const Message &recv); + /** + * \brief update the heartbeat record map + * \param node_id the \ref Node id + * \param t the last received heartbeat time + */ + void UpdateHeartbeat(int node_id, time_t t) { + std::lock_guard lk(heartbeat_mu_); + heartbeats_[node_id] = t; + } + /** + * \brief get node ids that haven't reported heartbeats for over t seconds + * \param t timeout in sec + */ + std::vector GetDeadNodes(int t = 60); + +private: + Postoffice(); + ~Postoffice() { + delete van_; + } + + void InitEnvironment(); + Van *van_; + mutable std::mutex mu_; + // app_id -> (customer_id -> customer pointer) + std::unordered_map> customers_; + std::unordered_map> node_ids_; + std::mutex server_key_ranges_mu_; + std::vector server_key_ranges_; + bool is_worker_, is_server_, is_scheduler_; + int num_servers_, num_workers_; + std::unordered_map> barrier_done_; + int verbose_; + std::mutex barrier_mu_; + std::condition_variable barrier_cond_; + std::mutex heartbeat_mu_; + std::mutex start_mu_; + int init_stage_ = 0; + std::unordered_map heartbeats_; + Callback exit_callback_; + /** \brief Holding a shared_ptr to prevent it from being destructed too + * early */ + std::shared_ptr env_ref_; + time_t start_time_; + DISALLOW_COPY_AND_ASSIGN(Postoffice); +}; + +/** \brief verbose log */ +#define PS_VLOG(x) LOG_IF(INFO, x <= Postoffice::Get()->verbose()) +} // namespace ps +#endif // PS_INTERNAL_POSTOFFICE_H_ diff --git a/ps-lite/include/ps/internal/threadsafe_pqueue.h b/ps-lite/include/ps/internal/threadsafe_pqueue.h new file mode 100644 index 0000000..95a927f --- /dev/null +++ b/ps-lite/include/ps/internal/threadsafe_pqueue.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_THREADSAFE_PQUEUE_H_ +#define PS_INTERNAL_THREADSAFE_PQUEUE_H_ +#include +#include +#include +#include +#include +#include +#include "ps/base.h" +namespace ps { + +/** + * \brief thread-safe queue allowing push and waited pop + */ +class ThreadsafePQueue { +public: + ThreadsafePQueue() { + } + ~ThreadsafePQueue() { + } + + /** + * \brief push an value into the end. threadsafe. 
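UpdateHeartbeat and GetDeadNodes above form the failure detector: every node's last heartbeat time is recorded, and nodes whose record is older than t seconds are reported dead. GetDeadNodes itself is defined in the .cc file, so the following is only a plausible sketch of that check, over a plain unordered_map instead of the Postoffice members:

#include <cassert>
#include <ctime>
#include <unordered_map>
#include <vector>

// Return the ids whose last heartbeat is more than `timeout` seconds old.
std::vector<int> DeadNodes(const std::unordered_map<int, time_t> &heartbeats,
                           time_t now, int timeout) {
    std::vector<int> dead;
    for (const auto &kv : heartbeats)
        if (kv.second + timeout < now)
            dead.push_back(kv.first);
    return dead;
}

int main() {
    time_t now = std::time(nullptr);
    std::unordered_map<int, time_t> hb = {
        {9, now - 5},    // worker 9 reported 5 s ago: alive
        {11, now - 120}, // worker 11 reported 2 min ago: dead for a 60 s timeout
    };
    std::vector<int> dead = DeadNodes(hb, now, 60);
    assert(dead.size() == 1 && dead[0] == 11);
    return 0;
}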
+ * \param new_value the value + */ + void Push(Message new_value) { + mu_.lock(); + queue_.push(std::move(new_value)); + mu_.unlock(); + cond_.notify_all(); + } + + /** + * \brief wait until pop an element from the beginning, threadsafe + * \param value the poped value + */ + void WaitAndPop(Message *value) { + std::unique_lock lk(mu_); + cond_.wait(lk, [this] { return !queue_.empty(); }); + *value = std::move(queue_.top()); + queue_.pop(); + } + +private: + class Compare { + public: + bool operator()(const Message &l, const Message &r) { + // hbsun: note it is the max-heap, + // In other words, the priority is larger, the quickly it is + // processed push is first, pull is later + return l.meta.priority <= r.meta.priority; + } + }; + mutable std::mutex mu_; + std::priority_queue, Compare> queue_; + std::condition_variable cond_; +}; + +} // namespace ps + +#endif // PS_INTERNAL_THREADSAFE_PQUEUE_H_ diff --git a/ps-lite/include/ps/internal/threadsafe_queue.h b/ps-lite/include/ps/internal/threadsafe_queue.h new file mode 100644 index 0000000..a4cd340 --- /dev/null +++ b/ps-lite/include/ps/internal/threadsafe_queue.h @@ -0,0 +1,63 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_THREADSAFE_QUEUE_H_ +#define PS_INTERNAL_THREADSAFE_QUEUE_H_ +#include +#include +#include +#include +#include +#include "ps/base.h" +namespace ps { + +/** + * \brief thread-safe queue allowing push and waited pop + */ +template +class ThreadsafeQueue { +public: + ThreadsafeQueue() { + } + ~ThreadsafeQueue() { + } + + /** + * \brief push an value into the end. threadsafe. + * \param new_value the value + */ + void Push(T new_value) { + mu_.lock(); + queue_.push(std::move(new_value)); + mu_.unlock(); + cond_.notify_all(); + } + + /** + * \brief wait until pop an element from the beginning, threadsafe + * \param value the poped value + */ + void WaitAndPop(T *value) { + std::unique_lock lk(mu_); + cond_.wait(lk, [this] { return !queue_.empty(); }); + *value = std::move(queue_.front()); + queue_.pop(); + } + +private: + mutable std::mutex mu_; + std::queue queue_; + std::condition_variable cond_; +}; + +} // namespace ps + +// bool TryPop(T& value) { +// std::lock_guard lk(mut); +// if(data_queue.empty()) +// return false; +// value=std::move(data_queue.front()); +// data_queue.pop(); +// return true; +// } +#endif // PS_INTERNAL_THREADSAFE_QUEUE_H_ diff --git a/ps-lite/include/ps/internal/utils.h b/ps-lite/include/ps/internal/utils.h new file mode 100644 index 0000000..f12b968 --- /dev/null +++ b/ps-lite/include/ps/internal/utils.h @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_UTILS_H_ +#define PS_INTERNAL_UTILS_H_ +#include "common/logging.h" +#include "ps/internal/env.h" +namespace ps { + +#ifdef _MSC_VER +typedef signed char int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned char uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +/*! + * \brief Get environment variable as int with default. + * \param key the name of environment variable. + * \param default_val the default value of environment vriable. 
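ThreadsafePQueue and ThreadsafeQueue above share one pattern: Push never blocks, WaitAndPop sleeps on a condition variable until an element arrives, and the priority queue's Compare makes the highest-priority message pop first. A minimal standalone queue with the same behaviour, using int in place of Message (DemoPQueue is an illustrative name):

#include <cassert>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

// Condition-variable-backed priority queue: Push never blocks, WaitAndPop
// blocks until an element is available and returns the largest one.
class DemoPQueue {
public:
    void Push(int v) {
        {
            std::lock_guard<std::mutex> lk(mu_);
            q_.push(v);
        }
        cond_.notify_all();
    }
    void WaitAndPop(int *v) {
        std::unique_lock<std::mutex> lk(mu_);
        cond_.wait(lk, [this] { return !q_.empty(); });
        *v = q_.top();
        q_.pop();
    }

private:
    std::mutex mu_;
    std::condition_variable cond_;
    std::priority_queue<int> q_; // max-heap, like the Message queue above
};

int main() {
    DemoPQueue q;
    std::thread producer([&q] { q.Push(1); q.Push(5); q.Push(3); });
    producer.join();
    int v;
    q.WaitAndPop(&v);
    assert(v == 5); // highest priority comes out first
    return 0;
}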
+ * \return The value received + */ +template +inline V GetEnv(const char *key, V default_val) { + const char *val = Environment::Get()->find(key); + if (val == nullptr) { + return default_val; + } else { + return V(val); + } +} + +inline int GetEnv(const char *key, int default_val) { + const char *val = Environment::Get()->find(key); + if (val == nullptr) { + return default_val; + } else { + return atoi(val); + } +} + +#ifndef DISALLOW_COPY_AND_ASSIGN +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName &); \ + void operator=(const TypeName &) +#endif + +#define LL LOG(ERROR) + +} // namespace ps +#endif // PS_INTERNAL_UTILS_H_ diff --git a/ps-lite/include/ps/internal/van.h b/ps-lite/include/ps/internal/van.h new file mode 100644 index 0000000..b9f5ef9 --- /dev/null +++ b/ps-lite/include/ps/internal/van.h @@ -0,0 +1,213 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_INTERNAL_VAN_H_ +#define PS_INTERNAL_VAN_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/base.h" +#include "ps/internal/message.h" +namespace ps { +class Resender; +class PBMeta; +/** + * \brief Van sends messages to remote nodes + * + * If environment variable PS_RESEND is set to be 1, then van will resend a + * message if it no ACK messsage is received within PS_RESEND_TIMEOUT + * millisecond + */ +class Van { +public: + /** + * \brief create Van + * \param type zmq, socket, ... + */ + static Van *Create(const std::string &type); + + /** \brief constructer, do nothing. use \ref Start for real start */ + Van() { + } + + /**\brief deconstructer, do nothing. use \ref Stop for real stop */ + virtual ~Van() { + } + + /** + * \brief start van + * + * must call it before calling Send + * + * it initalizes all connections to other nodes. start the receiving + * threads, which keeps receiving messages. if it is a system + * control message, give it to postoffice::manager, otherwise, give it to + * the accoding app. + */ + virtual void Start(int customer_id); + + /** + * \brief send a message, It is thread-safe + * \return the number of bytes sent. -1 if failed + */ + int Send(const Message &msg); + + /** + * \brief return my node + */ + inline const Node &my_node() const { + CHECK(ready_) << "call Start() first"; + return my_node_; + } + + /** + * \brief stop van + * stop receiving threads + */ + virtual void Stop(); + + /** + * \brief get next available timestamp. thread safe + */ + inline int GetTimestamp() { + return timestamp_++; + } + + /** + * \brief whether it is ready for sending. thread safe + */ + inline bool IsReady() { + return ready_; + } + +protected: + /** + * \brief connect to a node + */ + virtual void Connect(const Node &node) = 0; + + /** + * \brief bind to my node + * do multiple retries on binding the port. since it's possible that + * different nodes on the same machine picked the same port + * \return return the port binded, -1 if failed. + */ + virtual int Bind(const Node &node, int max_retry) = 0; + + /** + * \brief block until received a message + * \return the number of bytes received. 
-1 if failed or timeout + */ + virtual int RecvMsg(Message *msg) = 0; + + /** + * \brief send a mesage + * \return the number of bytes sent + */ + virtual int SendMsg(const Message &msg) = 0; + + /** + * \brief pack meta into a string + */ + void PackMeta(const Meta &meta, char **meta_buf, int *buf_size); + + /** + * \brief pack meta into protobuf + */ + void PackMetaPB(const Meta &meta, PBMeta *pb); + + /** + * \brief unpack meta from a string + */ + void UnpackMeta(const char *meta_buf, int buf_size, Meta *meta); + + Node scheduler_; + Node my_node_; + bool is_scheduler_; + std::mutex start_mu_; + +private: + /** thread function for receving */ + void Receiving(); + + /** thread function for heartbeat */ + void Heartbeat(); + + // node's address string (i.e. ip:port) -> node id + // this map is updated when ip:port is received for the first time + std::unordered_map connected_nodes_; + // maps the id of node which is added later to the id of node + // which is with the same ip:port and added first + std::unordered_map shared_node_mapping_; + + /** whether it is ready for sending */ + std::atomic ready_{false}; + std::atomic send_bytes_{0}; + size_t recv_bytes_ = 0; + int num_servers_ = 0; + int num_workers_ = 0; + /** the thread for receiving messages */ + std::unique_ptr receiver_thread_; + /** the thread for sending heartbeat */ + std::unique_ptr heartbeat_thread_; + std::vector barrier_count_; + /** msg resender */ + Resender *resender_ = nullptr; + int drop_rate_ = 0; + std::atomic timestamp_{0}; + int init_stage = 0; + + /** + * \brief processing logic of AddNode message for scheduler + */ + void ProcessAddNodeCommandAtScheduler(Message *msg, Meta *nodes, + Meta *recovery_nodes); + + /** + * \brief processing logic of Terminate message + */ + void ProcessTerminateCommand(); + + /** + * \brief processing logic of AddNode message (run on each node) + */ + void ProcessAddNodeCommand(Message *msg, Meta *nodes, Meta *recovery_nodes); + + /** + * \brief processing logic of Barrier message (run on each node) + */ + void ProcessBarrierCommand(Message *msg); + + /** + * \brief processing logic of AddNode message (run on each node) + */ + void ProcessHearbeat(Message *msg); + + /** + * \brief processing logic of Data message + */ + void ProcessDataMsg(Message *msg); + + /** + * \brief called by ProcessAddNodeCommand, in scheduler it assigns an id to + * the newly added node; in other nodes, it updates the node id with what is + * received from scheduler + */ + void UpdateLocalID(Message *msg, std::unordered_set *deadnodes_set, + Meta *nodes, Meta *recovery_nodes); + + const char *heartbeat_timeout_val = + Environment::Get()->find("PS_HEARTBEAT_TIMEOUT"); + int heartbeat_timeout_ = + heartbeat_timeout_val ? 
atoi(heartbeat_timeout_val) : 0; + + DISALLOW_COPY_AND_ASSIGN(Van); +}; +} // namespace ps +#endif // PS_INTERNAL_VAN_H_ diff --git a/ps-lite/include/ps/kvapp.h b/ps-lite/include/ps/kvapp.h new file mode 100644 index 0000000..eb87b5b --- /dev/null +++ b/ps-lite/include/ps/kvapp.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ps/internal/postoffice.h" +#include "ps/internal/customer.h" +#include "ps/internal/message.h" + +namespace ps { + +// Recursively register receive message handler (from 0 to kNumPSfunction) +template +struct KVAppRegisterHelper { + static void init(app *ptr) { + ptr->message_handlers[ftype] = std::bind( + &app::template onReceive, ptr, std::placeholders::_1); + KVAppRegisterHelper::init(ptr); + } +}; + +template +struct KVAppRegisterHelper { + static void init(app *ptr) { + } +}; + +class KVApp { +public: + explicit KVApp(int app_id) { + obj_.reset(new Customer( + app_id, app_id, + std::bind(&KVApp::Process, this, std::placeholders::_1))); + } + std::unique_ptr obj_; + +private: + void Process(const Message &msg) { + CHECK_LT(msg.meta.psftype, kNumPSfunction) + << "Unknown PS Function Received"; + message_handlers[msg.meta.psftype](msg); + } + + typedef std::function MessageHandle; + MessageHandle message_handlers[kNumPSfunction]; + template + friend struct KVAppRegisterHelper; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/partitioner.h b/ps-lite/include/ps/partitioner.h new file mode 100644 index 0000000..bb90576 --- /dev/null +++ b/ps-lite/include/ps/partitioner.h @@ -0,0 +1,125 @@ +#pragma once + +#include + +namespace ps { + +class Partitioner { +protected: + const std::vector &server_range; + size_t server_num; + +public: + Partitioner() : server_range(Postoffice::Get()->GetServerKeyRanges()) { + server_num = server_range.size(); + } + virtual ~Partitioner() { + } + virtual void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + } + virtual void partitionSparse(size_t length, size_t width, + std::vector &keys, + std::vector &parts) { + } + virtual int queryServer(Key key) { + return 0; + } +}; + +/* Naive partitioner, average partition into servers */ +class AveragePartitioner : public Partitioner { +private: + Key _globalId; + size_t _serverIndex; + size_t partition_num; + +public: + AveragePartitioner(size_t part_num = 0) : Partitioner() { + _globalId = 0; + _serverIndex = 0; + if (part_num == 0 || part_num > server_num) + part_num = server_num; + partition_num = part_num; + } + + void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + size_t per_part_len = length / partition_num; + size_t rem = length % partition_num; + for (size_t i = 0; i < partition_num; i++) { + size_t server_idx = (i + _serverIndex) % server_num; + keys.push_back(_globalId + server_range[server_idx].begin()); + parts.push_back(per_part_len + (i < rem)); + } + _globalId++; + _serverIndex = (_serverIndex + partition_num) % server_num; + } + + void partitionSparse(size_t length, size_t width, std::vector &keys, + std::vector &parts) { + partitionDense(length, keys, parts); + } + + int queryServer(Key key) { + size_t server_id = 0; + while (server_id < server_num + && key >= server_range[server_id].begin()) { + server_id++; + } + return int(server_id - 1); + } +}; + +/* Use blocks to partition, intuition from BytePS */ +class BlockPartitioner : public Partitioner { +private: + Key _globalId; + size_t _serverIndex; + size_t _block; + +public: + BlockPartitioner(size_t block_size = 1000000) : Partitioner() { + _globalId = 0; + 
_serverIndex = 0; + _block = block_size; + } + + void partitionDense(size_t length, std::vector &keys, + std::vector &parts) { + partitionImpl(length, _block, keys, parts); + } + + void partitionSparse(size_t length, size_t width, std::vector &keys, + std::vector &parts) { + size_t cur_block = std::max(_block / width, size_t(1)); + partitionImpl(length, cur_block, keys, parts); + } + + void partitionImpl(size_t length, size_t cur_block, std::vector &keys, + std::vector &parts) { + size_t DLArray_len = length; + while (DLArray_len != 0) { + keys.push_back(_globalId + server_range[_serverIndex].begin()); + _serverIndex++; + auto tmp = std::min(cur_block, DLArray_len); + parts.push_back(tmp); + DLArray_len -= tmp; + if (_serverIndex == server_num) { + _globalId++; + _serverIndex = 0; + } + } + } + + int queryServer(Key key) { + size_t server_id = 0; + while (server_id < server_num + && key >= server_range[server_id].begin()) { + server_id++; + } + return int(server_id - 1); + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/ps.h b/ps-lite/include/ps/ps.h new file mode 100644 index 0000000..f40262f --- /dev/null +++ b/ps-lite/include/ps/ps.h @@ -0,0 +1,90 @@ +/*! + * Copyright (c) 2015 by Contributors + * @file ps.h + * \brief The parameter server interface + */ +#ifndef PS_PS_H_ +#define PS_PS_H_ +/** \brief basic setups in ps */ +#include "ps/base.h" +/** \brief communcating with a list of key-value paris. */ +#include "common/thread_pool.h" +#include "ps/internal/postoffice.h" +namespace ps { +/** \brief Returns the number of worker nodes */ +inline int NumWorkers() { + return Postoffice::Get()->num_workers(); +} +/** \brief Returns the number of server nodes */ +inline int NumServers() { + return Postoffice::Get()->num_servers(); +} +/** \brief Returns true if this node is a worker node */ +inline bool IsWorker() { + return Postoffice::Get()->is_worker(); +} +/** \brief Returns true if this node is a server node. */ +inline bool IsServer() { + return Postoffice::Get()->is_server(); +} +/** \brief Returns true if this node is a scheduler node. */ +inline bool IsScheduler() { + return Postoffice::Get()->is_scheduler(); +} +/** \brief Returns the rank of this node in its group + * + * Each worker will have a unique rank within [0, NumWorkers()). So are + * servers. This function is available only after \ref Start has been called. + */ +inline int MyRank() { + return Postoffice::Get()->my_rank(); +} +/** + * \brief start the system + * + * This function will block until every nodes are started. + * \param argv0 the program name, used for logging + */ +inline void Start(int customer_id, const char *argv0 = nullptr) { + Postoffice::Get()->Start(customer_id, argv0, true); +} +/** + * \brief start the system + * + * This function will NOT block. + * \param argv0 the program name, used for logging + */ +inline void StartAsync(int customer_id, const char *argv0 = nullptr) { + Postoffice::Get()->Start(customer_id, argv0, false); +} +/** + * \brief terminate the system + * + * All nodes should call this function before existing. + * \param do_barrier whether to block until every node is finalized, default + * true. 
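BlockPartitioner above slices a parameter of `length` elements into fixed-size blocks and hands them to the servers round-robin, bumping a global id each time it wraps around the server list. The round-robin split can be sketched without the Key-range machinery; SplitIntoBlocks below is an illustrative helper, and unlike the real partitionImpl it omits the per-server key offsets taken from GetServerKeyRanges():

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative block split: returns (server_index, block_length) pairs,
// walking the servers round-robin the way partitionImpl walks server_range.
std::vector<std::pair<size_t, size_t>>
SplitIntoBlocks(size_t length, size_t block, size_t num_servers) {
    std::vector<std::pair<size_t, size_t>> parts;
    size_t server = 0;
    while (length != 0) {
        size_t take = std::min(block, length);
        parts.emplace_back(server, take);
        length -= take;
        server = (server + 1) % num_servers;
    }
    return parts;
}

int main() {
    // 2.5 blocks of data over 2 servers:
    // server 0 gets 1000000, server 1 gets 1000000, server 0 gets 500000.
    auto parts = SplitIntoBlocks(2500000, 1000000, 2);
    for (const auto &p : parts)
        std::printf("server %zu <- %zu elements\n", p.first, p.second);
    return 0;
}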
+ */ +inline void Finalize(int customer_id, const bool do_barrier = true) { + Postoffice::Get()->Finalize(customer_id, do_barrier); +} +/** + * \brief Register a callback to the system which is called after Finalize() + * + * The following codes are equal + * \code {cpp} + * RegisterExitCallback(cb); + * Finalize(); + * \endcode + * + * \code {cpp} + * Finalize(); + * cb(); + * \endcode + * \param cb the callback function + */ +inline void RegisterExitCallback(const std::function &cb) { + Postoffice::Get()->RegisterExitCallback(cb); +} + +} // namespace ps +#endif // PS_PS_H_ diff --git a/ps-lite/include/ps/psf/PSFunc.h b/ps-lite/include/ps/psf/PSFunc.h new file mode 100644 index 0000000..2988de8 --- /dev/null +++ b/ps-lite/include/ps/psf/PSFunc.h @@ -0,0 +1,63 @@ +#pragma once + +#include "common/sarray.h" +#include "ps/base.h" + +#include +#include +using std::tuple; +using std::get; +using std::function; + +namespace ps { + +enum PsfType { + /* Dense ops */ + DensePush, + DensePull, + DDPushPull, + /* Sparse ops */ + SparsePush, + SparsePull, + SDPushPull, + SSPushPull, + /* misc ops */ + ParamInit, + ParamClear, + ParamSave, + ParamLoad, + /* cache sparse table */ + kSyncEmbedding, + kPushEmbedding, + kPushSyncEmbedding, + kNumPSfunction, +}; + +template +struct PSFData; +/* + To define a new PSFunc, we need 3 parts : Request, Response, _callback + * Request and Response are tuple-like object, and must only use + scalar types like int, float or Sarray + * _callback is a function having format void(const Response&, args...) + where args are some target memory space to write back + * See examples in dense.h sparse.h ... +*/ + +/* + getCallBack, use this to bind _callback to the get the real callback which can + be stored example: getCallBack(target); +*/ +template +function::Response &)> +getCallBack(Args &&... 
args) { + return std::bind(PSFData::_callback, std::placeholders::_1, + std::forward(args)...); +} + +} // namespace ps + +#include "dense.h" +#include "sparse.h" +#include "misc.h" +#include "cachetable.h" diff --git a/ps-lite/include/ps/psf/cachetable.h b/ps-lite/include/ps/psf/cachetable.h new file mode 100644 index 0000000..04ed8f5 --- /dev/null +++ b/ps-lite/include/ps/psf/cachetable.h @@ -0,0 +1,50 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +typedef int64_t version_t; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // data + SArray // updates + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // current version + version_t // bound + >; + using Response = tuple, // rows that should be updated + SArray, // server version returned + SArray // embedding value + >; + // Use a closure to pass cached embedding data target + typedef std::function + Closure; +}; + +template <> +struct PSFData { + using Request = tuple, // rows + SArray, // current version + version_t, // bound + SArray, // push rows + SArray, // push data + SArray // push updates + >; + using Response = PSFData::Response; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/dense.h b/ps-lite/include/ps/psf/dense.h new file mode 100644 index 0000000..ebb5870 --- /dev/null +++ b/ps-lite/include/ps/psf/dense.h @@ -0,0 +1,44 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +template <> +struct PSFData { + using Request = tuple; + using Response = tuple // data + >; + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +template <> +struct PSFData { + using Request = tuple // data + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = PSFData::Request; + using Response = PSFData::Response; + + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/misc.h b/ps-lite/include/ps/psf/misc.h new file mode 100644 index 0000000..6ad944a --- /dev/null +++ b/ps-lite/include/ps/psf/misc.h @@ -0,0 +1,62 @@ +#pragma once + +#include "PSFunc.h" + +namespace ps { + +enum InitType { + Constant, + Uniform, + Normal, + TruncatedNormal, +}; + +template <> +struct PSFData { + using Request = tuple // opt arguments + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // address + bool // different from load + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple // address + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/psf/serializer.h b/ps-lite/include/ps/psf/serializer.h new file mode 100644 index 0000000..dadd127 --- /dev/null +++ 
b/ps-lite/include/ps/psf/serializer.h @@ -0,0 +1,110 @@ +#pragma once + +#include "common/sarray.h" + +#include +#include +using std::tuple; +using std::vector; + +namespace ps { + +// we don't have if-constexpr in c++11, so we use this +template +class ScalarTag {}; + +// decide whether a data is scalar type or SArray +// isScalar::value -> true +template +class isScalar { +public: + constexpr static bool value = + std::is_integral::value || std::is_floating_point::value; + using Tag = ScalarTag; +}; + +// Helper class to serialize Tuples recursively +template +class tupleSerializer { +public: + // encode scalar type, put it in target[0] + template + static void _encode(const dtype &t, vector> &target, + ScalarTag) { + size_t cur_size = target[0].size(); + target[0].resize(cur_size + sizeof(dtype)); + dtype *ptr = reinterpret_cast(target[0].data() + cur_size); + *ptr = t; + } + // encode sarray type, append it to target(no copy) + template + static void _encode(const dtype &t, vector> &target, + ScalarTag) { + SArray bytes(t); + target.push_back(bytes); + } + // encode a tuple from back to front + static void encode(const Tuple &tup, vector> &target) { + auto &t = std::get(tup); + typedef typename std::remove_reference::type dtype; + _encode(t, target, typename isScalar::Tag()); + tupleSerializer::encode(tup, target); + } + //---------------------------------Decode--------------------------------------- + template + static void _decode(dtype &t, const vector> &target, + ScalarTag, size_t &scalar_hint, + size_t &array_hint) { + dtype *ptr = reinterpret_cast(target[0].data() + scalar_hint + - sizeof(dtype)); + t = *ptr; + scalar_hint -= sizeof(dtype); + } + template + static void _decode(dtype &t, const vector> &target, + ScalarTag, size_t &scalar_hint, + size_t &array_hint) { + t = target[array_hint - 1]; + array_hint--; + } + // scalar_hint, array_hint, tell where to take the data from target + static void decode(Tuple &tup, const vector> &target, + size_t scalar_hint, size_t array_hint) { + // When decode, from front to back + auto &t = std::get::value - N>(tup); + typedef typename std::remove_reference::type dtype; + _decode(t, target, typename isScalar::Tag(), scalar_hint, + array_hint); + tupleSerializer::decode(tup, target, scalar_hint, + array_hint); + } +}; + +// Handle template specialization +template +class tupleSerializer { +public: + static void encode(const Tuple &tup, vector> &target) { + } + static void decode(Tuple &tup, const vector> &target, + size_t scalar_hint, size_t array_hint) { + } +}; + +// ------------------------------ Exported APIs +// ------------------------------------------------ +template +void tupleEncode(const Tuple &tup, vector> &dest) { + dest.clear(); + dest.push_back(SArray()); // Reserve for scalar types + dest[0].reserve(sizeof(Tuple)); + tupleSerializer::value>::encode(tup, dest); +} + +template +void tupleDecode(Tuple &tup, const vector> &dest) { + tupleSerializer::value>::decode( + tup, dest, dest[0].size(), dest.size()); +} + +} // namespace ps diff --git a/ps-lite/include/ps/psf/sparse.h b/ps-lite/include/ps/psf/sparse.h new file mode 100644 index 0000000..4934fa0 --- /dev/null +++ b/ps-lite/include/ps/psf/sparse.h @@ -0,0 +1,87 @@ +#pragma once + +#include "PSFunc.h" +#include "dense.h" + +namespace ps { + +template <> +struct PSFData { + using Request = tuple // offset + >; + using Response = tuple // data + >; + static void + _callback(const Response &response, SArray tgt, + std::vector>> mapping, + size_t offset, size_t width) { + auto val 
= get<0>(response); + CHECK_EQ(val.size(), mapping.size() * width) + << val.size() << " " << mapping.size() << " " << width; + for (size_t i = 0; i < mapping.size(); ++i) { + auto begin_iter = val.begin() + i * width; + auto end_iter = begin_iter + width; + for (auto idx : mapping[i].second) { + std::copy(begin_iter, end_iter, tgt.begin() + idx * width); + } + } + } +}; + +template <> +struct PSFData { + using Request = tuple, // offset + SArray // data + >; + using Response = tuple<>; + static void _callback(const Response &response) { + } +}; + +template <> +struct PSFData { + using Request = tuple, // offset + SArray, // data + size_t // len for densepull + >; + using Response = PSFData::Response; + + static void _callback(const Response &response, SArray tgt) { + auto val = get<0>(response); + CHECK_EQ(val.size(), tgt.size()) << val.size() << " " << tgt.size(); + std::copy(val.begin(), val.end(), tgt.begin()); + } +}; + +template <> +struct PSFData { + using Request = tuple, // push offset + SArray, // data + SArray // pull offset + >; + using Response = PSFData::Response; + + static void + _callback(const Response &response, SArray tgt, + std::vector>> mapping, + size_t offset, size_t width) { + auto val = get<0>(response); + if (val.size() > 0) { + CHECK_EQ(val.size(), mapping.size() * width) + << val.size() << " " << mapping.size() << " " << width; + for (size_t i = 0; i < mapping.size(); ++i) { + auto begin_iter = val.begin() + i * width; + auto end_iter = begin_iter + width; + for (auto idx : mapping[i].second) { + std::copy(begin_iter, end_iter, tgt.begin() + idx * width); + } + } + } + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/range.h b/ps-lite/include/ps/range.h new file mode 100644 index 0000000..3905939 --- /dev/null +++ b/ps-lite/include/ps/range.h @@ -0,0 +1,35 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_RANGE_H_ +#define PS_RANGE_H_ +#include "ps/internal/utils.h" +namespace ps { + +/** + * \brief a range [begin, end) + */ +class Range { +public: + Range() : Range(0, 0) { + } + Range(uint64_t begin, uint64_t end) : begin_(begin), end_(end) { + } + + uint64_t begin() const { + return begin_; + } + uint64_t end() const { + return end_; + } + uint64_t size() const { + return end_ - begin_; + } + +private: + uint64_t begin_; + uint64_t end_; +}; + +} // namespace ps +#endif // PS_RANGE_H_ diff --git a/ps-lite/include/ps/server/PSFHandle.h b/ps-lite/include/ps/server/PSFHandle.h new file mode 100644 index 0000000..95b823b --- /dev/null +++ b/ps-lite/include/ps/server/PSFHandle.h @@ -0,0 +1,404 @@ +#pragma once + +#include "ps/psf/PSFunc.h" + +#include "common/thread_safe_hash_map.h" +#include "param.h" +#include +#include +#include +#include +#include +#include + +namespace ps { +/** + * \brief used in ML part for sparse/dense pull, push. + * keys is used for the key of one partition. + * lens is used as the offset of the keys. + * vals is vals. + * One key (two keys for binary op) per request in Hetu. + * Is it ok in a lock-free manner? 
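The serializer shown a little earlier flattens a Request/Response tuple into a vector of byte buffers: every scalar field is packed back-to-back into slot 0 and every SArray field is appended as its own zero-copy slot. That layout, reduced to a fixed (key, values) pair with std::vector in place of SArray and no template recursion (Encoded/Encode/Decode are illustrative names):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Simplified picture of the wire layout built by tupleEncode/tupleDecode:
// slot 0 carries the packed scalars, later slots carry each array field.
struct Encoded {
    std::vector<char> scalars;              // slot 0
    std::vector<std::vector<float>> arrays; // slots 1..n
};

Encoded Encode(uint64_t key, const std::vector<float> &vals) {
    Encoded out;
    out.scalars.resize(sizeof(uint64_t));
    std::memcpy(out.scalars.data(), &key, sizeof(uint64_t));
    out.arrays.push_back(vals); // the real code appends the SArray without copying
    return out;
}

void Decode(const Encoded &in, uint64_t *key, std::vector<float> *vals) {
    std::memcpy(key, in.scalars.data(), sizeof(uint64_t));
    *vals = in.arrays[0];
}

int main() {
    uint64_t key = 0;
    std::vector<float> vals;
    Decode(Encode(42, {1.0f, 2.0f}), &key, &vals);
    assert(key == 42 && vals.size() == 2 && vals[1] == 2.0f);
    return 0;
}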
By @Zhipeng + */ + +class KVServerMatrixHandle { +public: + KVServerMatrixHandle() { + } + KVServerMatrixHandle(const KVServerMatrixHandle &handle) { + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + size_t len = get<1>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = *iter->second; + size_t data_size = value_set_.size(); + CHECK_EQ(len, data_size) << " size mismatch in DensePull " << k + << " " << len << " " << data_size; + pull_vals.resize(data_size); + auto read_lock = value_set_.read_guard(); + std::copy(value_set_.begin(), value_set_.end(), pull_vals.begin()); + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + size_t len = get<1>(request); + SArray vals = get<2>(request); + + if (const_store.find(k) == const_store.end()) { + store[k] = std::make_shared>(len, OptType::None, + SArray()); + } + auto iter = const_store.find(k); + if (iter != const_store.end()) { + CHECK_EQ(len, iter->second->size()) + << k << " " << len << " " << iter->second->size() + << " size mismatch in DensePush"; + // write, discard const qualifier + auto &value_set_ = + *const_cast(iter->second); + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < value_set_.size(); j++) + value_set_[j] += vals[j]; + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // one key per request. + // with response result + Key k = get<0>(request); + size_t len = get<1>(request); + SArray vals = get<2>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *const_cast(iter->second); + size_t data_size = value_set_.size(); + CHECK_EQ(len, data_size) + << " size mismatch in DDPushPull " << len << " " << data_size; + pull_vals.resize(data_size); + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < data_size; j++) { + value_set_[j] += vals[j]; + pull_vals[j] = value_set_[j]; + } + } else { + LG << "Key does not exist on PS in DensePull" << k; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // we use length as the offset, i.e., #length = #vals. + // with response result + Key k = get<0>(request); + SArray offset = get<1>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + pull_vals.resize(offset.size() * width); + auto read_lock = value_set_.read_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offset.size(); ++j) { + auto value_begin = value_set_.data() + offset[j] * width; + auto value_end = value_begin + width; + auto dst_begin = pull_vals.data() + j * width; + std::copy(value_begin, value_end, dst_begin); + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pulled key: " << k + << " does not exist on PS in SparsePull."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // we use length as the offset, i.e., #length = #vals. 
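The sparse handlers above address a parameter as a row-major (length x width) matrix: request entry j touches the width floats starting at offsets[j] * width. The gather performed by the SparsePull handler, reduced to a standalone function (GatherRows is an illustrative name):

#include <cassert>
#include <cstddef>
#include <vector>

// Row gather as in the SparsePull handler: for each requested row offset,
// copy one row of `width` floats out of the row-major table.
std::vector<float> GatherRows(const std::vector<float> &table, size_t width,
                              const std::vector<size_t> &offsets) {
    std::vector<float> out(offsets.size() * width);
    for (size_t j = 0; j < offsets.size(); ++j)
        for (size_t c = 0; c < width; ++c)
            out[j * width + c] = table[offsets[j] * width + c];
    return out;
}

int main() {
    // A 3-row, 2-column table; pull rows 2 and 0.
    std::vector<float> table = {0, 1, 10, 11, 20, 21};
    std::vector<float> pulled = GatherRows(table, 2, {2, 0});
    assert(pulled[0] == 20 && pulled[1] == 21 && pulled[2] == 0 && pulled[3] == 1);
    return 0;
}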
+ // no response result + Key k = get<0>(request); + SArray offsets = get<1>(request); + SArray vals = get<2>(request); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + + CHECK_EQ(vals.size(), offsets.size() * width) + << " in Psf::SparsePush check failed," + << " size of vals is " << vals.size() << " size of lens is " + << offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in SparsePush."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray offsets = get<1>(request); + SArray vals = get<2>(request); + size_t len = get<3>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + CHECK_EQ(len, value_set_.size()) + << " size mismatch in SDPushPull " << k << " " << len << " " + << value_set_.size(); + + // sparsepush phase + if (vals.size() > 0) { + CHECK_EQ(vals.size(), offsets.size() * width) + << " in Psf::SDPushPull check failed," + << " size of vals is " << vals.size() << " size of lens is " + << offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } + // densepull phase + pull_vals.resize(value_set_.size()); + auto read_lock = value_set_.read_guard(); + std::copy(value_set_.begin(), value_set_.end(), pull_vals.begin()); + } else { + // error, the key does not exist on PS. 
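The SparsePush handler above is the mirror operation to SparsePull: pushed rows are accumulated (+=) into the table at their offsets rather than overwriting them, so pushing the same row twice adds up. The scatter-add on its own (ScatterAddRows is an illustrative name):

#include <cassert>
#include <cstddef>
#include <vector>

// Row scatter-add as in the SparsePush handler: each pushed row is
// accumulated into the row-major table at its offset.
void ScatterAddRows(std::vector<float> &table, size_t width,
                    const std::vector<size_t> &offsets,
                    const std::vector<float> &vals) {
    assert(vals.size() == offsets.size() * width); // same check as the handler
    for (size_t j = 0; j < offsets.size(); ++j)
        for (size_t c = 0; c < width; ++c)
            table[offsets[j] * width + c] += vals[j * width + c];
}

int main() {
    std::vector<float> table(6, 0.0f);              // 3 rows x 2 cols, all zero
    ScatterAddRows(table, 2, {1, 1}, {1, 1, 2, 2}); // row 1 pushed twice
    assert(table[2] == 3 && table[3] == 3 && table[0] == 0);
    return 0;
}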
+ LF << "[Error] The pushed key: " << k + << " does not exist on PS in SDPushPull."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray push_offsets = get<1>(request); + SArray vals = get<2>(request); + SArray pull_offsets = get<3>(request); + SArray &pull_vals = get<0>(response); + + auto iter = const_store.find(k); + if (iter != const_store.end()) { + auto &value_set_ = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set_.width; + + // sparsepush phase + if (vals.size() > 0) { + CHECK_EQ(vals.size(), push_offsets.size() * width) + << " in Psf::SSPushPull check failed," + << " size of vals is " << vals.size() << " size of lens is " + << push_offsets.size() << " size of width is " << width; + + // write, discard const qualifier + auto write_lock = value_set_.write_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < push_offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = push_offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + value_set_[dst_offset + k] += vals[src_offset + k]; + } + } + } + + // sparsepull phase + if (pull_offsets.size() > 0) { + pull_vals.resize(pull_offsets.size() * width); + auto read_lock = value_set_.read_guard(); +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < pull_offsets.size(); ++j) { + auto val_begin = + value_set_.begin() + pull_offsets[j] * width; + auto val_end = val_begin + width; + auto dst_begin = pull_vals.begin() + j * width; + std::copy(val_begin, val_end, dst_begin); + } + } + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in SparsePush."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response); + void serve(const PSFData::Request &request, + PSFData::Response &response); + void serve(const PSFData::Request &request, + PSFData::Response &response); + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + // one key per request. 
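The ParamInit handler starting here fills a freshly created parameter according to its InitType; the TruncatedNormal branch resamples until the value falls within two standard deviations of the mean. That rejection loop, isolated into a runnable snippet:

#include <cassert>
#include <cmath>
#include <random>

// Truncated-normal sample in [mean - 2*stddev, mean + 2*stddev], drawn by
// rejection sampling as in the TruncatedNormal branch of ParamInit.
float TruncatedNormal(std::default_random_engine &gen, float mean, float stddev) {
    std::normal_distribution<float> dist(mean, stddev);
    const float lo = mean - 2 * stddev, hi = mean + 2 * stddev;
    float v = dist(gen);
    while (v < lo || v > hi)
        v = dist(gen);
    return v;
}

int main() {
    std::default_random_engine gen(1234);
    for (int i = 0; i < 1000; ++i)
        assert(std::fabs(TruncatedNormal(gen, 0.0f, 1.0f)) <= 2.0f);
    return 0;
}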
+ // no response result + Key k = get<0>(request); + ParamType param_type = (ParamType)get<1>(request); + size_t len = get<2>(request); + size_t width = get<3>(request); + InitType init_type = (InitType)get<4>(request); + double init_a = get<5>(request); + double init_b = get<6>(request); + unsigned long long seed = get<7>(request); + OptType otype = (OptType)get<8>(request); + SArray lrs = get<9>(request); + + Param *newParam = nullptr; + switch (param_type) { + case kParam: + newParam = new Param(len, otype, lrs); + break; + case kParam2D: + newParam = new Param2D(len, width, otype, lrs); + break; + case kCacheTable: + newParam = new CacheTable(len, width, otype, lrs); + } + auto iter = store.emplaceIfAbsent(k, newParam); + + CHECK_EQ(len * width, iter->second->size()) + << k << " " << len << " " << width << " " << iter->second->size() + << " size mismatch in UniformInit"; + // write, discard const qualifier + auto &value_set_ = + *const_cast(iter->second); + auto write_lock = value_set_.write_guard(); + if (init_type == InitType::Constant) { + float filled_value = static_cast(init_a); + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < value_set_.size(); j++) + value_set_[j] = filled_value; + } else if (init_type == InitType::Uniform) { + std::uniform_real_distribution uniform_dist(init_a, init_b); + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + value_set_[j] = uniform_dist(generator); + } + } else if (init_type == InitType::Normal) { + std::normal_distribution normal_dist(init_a, init_b); + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + value_set_[j] = normal_dist(generator); + } + } else if (init_type == InitType::TruncatedNormal) { + std::normal_distribution truncated_normal_dist(init_a, + init_b); + float upper_limit = init_a + 2 * init_b; + float lower_limit = init_a - 2 * init_b; + std::default_random_engine generator(seed); + for (size_t j = 0; j < value_set_.size(); j++) { + float temp = truncated_normal_dist(generator); + while (temp > upper_limit || temp < lower_limit) + temp = truncated_normal_dist(generator); + value_set_[j] = temp; + } + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto iter = store.find(k); + if (iter != store.end()) { + store.erase(iter); + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamClear."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray address = get<1>(request); + auto iter = store.find(k); + if (iter != store.end()) { + auto &value_set_ = *iter->second; + auto read_lock = value_set_.read_guard(); + std::ofstream fout( + std::string(address.data(), address.size()).c_str(), + std::ios::binary); + fout.write((char *)value_set_.data(), + value_set_.size() * sizeof(float)); + } else { + // error, the key does not exist on PS. 
+ LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamSave."; + } + } + + void serve(const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + SArray address = get<1>(request); + auto iter = store.find(k); + if (iter != store.end()) { + auto &value_set_ = *iter->second; + auto write_lock = value_set_.write_guard(); + std::ifstream fin( + std::string(address.data(), address.size()).c_str(), + std::ios::binary); + fin.read((char *)value_set_.data(), + value_set_.size() * sizeof(float)); + } else { + // error, the key does not exist on PS. + LF << "[Error] The pushed key: " << k + << " does not exist on PS in ParamLoad."; + } + } + +private: + typedef threadsafe_unordered_map>> tmap; + tmap store; + const tmap &const_store = + store; // const reference to force compiler to use read lock +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/kvserver.h b/ps-lite/include/ps/server/kvserver.h new file mode 100644 index 0000000..e2a8556 --- /dev/null +++ b/ps-lite/include/ps/server/kvserver.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ps/server/PSFHandle.h" +#include "ps/psf/serializer.h" +#include "ps/kvapp.h" +#include +#include +namespace ps { + +template +struct KVServerRegisterHelper; + +/** + * \brief A server node for maintaining key-value pairs + */ +class KVServer : public KVApp { +public: + /** + * \brief constructor + * \param app_id the app id, should match with \ref KVWorker's id + */ + explicit KVServer(int app_id) : KVApp(app_id) { + KVAppRegisterHelper::init(this); + } + +private: + template + void onReceive(const Message &msg) { + typename PSFData::Request request; + typename PSFData::Response response; + tupleDecode(request, msg.data); + handler.serve(request, response); + Message rmsg; + tupleEncode(response, rmsg.data); + rmsg.meta = msg.meta; + rmsg.meta.recver = msg.meta.sender; + rmsg.meta.request = false; + Postoffice::Get()->van()->Send(rmsg); + } + + /** \brief request handle */ + KVServerMatrixHandle handler; + template + friend struct KVAppRegisterHelper; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/optimizer.h b/ps-lite/include/ps/server/optimizer.h new file mode 100644 index 0000000..c433019 --- /dev/null +++ b/ps-lite/include/ps/server/optimizer.h @@ -0,0 +1,357 @@ +#pragma once + +#include +#include "ps/server/param.h" + +namespace ps { + +template +class Param; +template +class Param2D; +template +class CacheTable; + +enum OptType { + SGD, + Momentum, + NesterovMomentum, + AdaGrad, + Adam, + None, +}; + +template +class Optimizer { +public: + virtual void ApplyDense(Param ¶m, SArray &grads); + virtual void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads); + virtual void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads); + virtual void InitStates(size_t size); +}; + +template +class SGDOptimizer : public Optimizer { +public: + explicit SGDOptimizer(float learning_rate) : lr(learning_rate) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + param[j] -= lr * grads[j]; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + param[dst_offset + k] -= lr * grads[src_offset + k]; + } + } + } + + void 
ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + param[dst_offset + k] -= lr * grads[src_offset + k]; + } + } + } + + void InitStates(size_t size) { + } + +private: + float lr; +}; + +// Optimizers below need tests! No correctness guarantees. +template +class MomentumOptimizer : public Optimizer { +public: + explicit MomentumOptimizer(float learning_rate, float momentum) : + lr(learning_rate), moment(momentum) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + velocity[j] = moment * velocity[j] - lr * grads[j]; + param[j] = param[j] + velocity[j]; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + velocity[cur_dst] = + moment * velocity[cur_dst] - lr * grads[cur_src]; + param[cur_dst] = param[cur_dst] + velocity[cur_dst]; + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + velocity[cur_dst] = + moment * velocity[cur_dst] - lr * grads[cur_src]; + param[cur_dst] = param[cur_dst] + velocity[cur_dst]; + } + } + } + + void InitStates(size_t size) { + velocity = new V[size](); + } + +private: + float lr; + float moment; + V *velocity; +}; + +template +class NesterovMomentumOptimizer : public Optimizer { +public: + explicit NesterovMomentumOptimizer(float learning_rate, float momentum) : + lr(learning_rate), moment(momentum) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + V temp = -lr * grads[j]; + velocity[j] = moment * (velocity[j] + temp); + param[j] = param[j] + velocity[j] + temp; + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + V temp = -lr * grads[cur_src]; + velocity[cur_dst] = moment * (velocity[cur_dst] + temp); + param[cur_dst] = param[cur_dst] + velocity[cur_dst] + temp; + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = 
src_offset + k; + size_t cur_dst = dst_offset + k; + V temp = -lr * grads[cur_src]; + velocity[cur_dst] = moment * (velocity[cur_dst] + temp); + param[cur_dst] = param[cur_dst] + velocity[cur_dst] + temp; + } + } + } + + void InitStates(size_t size) { + velocity = new V[size](); + } + +private: + float lr; + float moment; + V *velocity; +}; + +template +class AdaGradOptimizer : public Optimizer { +public: + explicit AdaGradOptimizer(float learning_rate, float initial, + float epsilon) : + lr(learning_rate), + init(initial), eps(epsilon) { + } + + void ApplyDense(Param ¶m, SArray &grads) { +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + accum[j] = accum[j] + grads[j] * grads[j]; + param[j] = param[j] - lr * grads[j] / (sqrt(accum[j]) + eps); + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + accum[cur_dst] = + accum[cur_dst] + grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * grads[cur_src] / (sqrt(accum[cur_dst]) + eps); + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width = param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + accum[cur_dst] = + accum[cur_dst] + grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * grads[cur_src] / (sqrt(accum[cur_dst]) + eps); + } + } + } + + void InitStates(size_t size) { + accum = new V[size]; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < size; ++j) + accum[j] = init; + } + +private: + float lr; + float init; + float eps; + V *accum; +}; + +template +class AdamOptimizer : public Optimizer { +public: + explicit AdamOptimizer(float learning_rate, float beta1, float beta2, + float epsilon) : + lr(learning_rate), + b1(beta1), b2(beta2), eps(epsilon) { + b1t = 1.0; + b2t = 1.0; + } + + void ApplyDense(Param ¶m, SArray &grads) { + b1t = b1t * b1; + b2t = b2t * b2; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < param.size(); ++j) { + marr[j] = b1 * marr[j] + (1 - b1) * grads[j]; + varr[j] = b2 * varr[j] + (1 - b2) * grads[j] * grads[j]; + param[j] = + param[j] + - lr * marr[j] / (1 - b1t) / (sqrt(varr[j] / (1 - b2t)) + eps); + } + } + + void ApplySparse(Param2D ¶m, SArray &offsets, + SArray &grads) { + size_t width = param.width; +#pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + marr[cur_dst] = b1 * marr[cur_dst] + (1 - b1) * grads[cur_src]; + varr[cur_dst] = b2 * varr[cur_dst] + + (1 - b2) * grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * marr[cur_dst] / (1 - b1t) + / (sqrt(varr[cur_dst] / (1 - b2t)) + eps); + } + } + } + + void ApplyCache(CacheTable ¶m, SArray &updates, + SArray &offsets, SArray &grads) { + size_t width 
= param.width; + // #pragma omp parallel for num_threads(4) + for (size_t j = 0; j < offsets.size(); ++j) { + param.ver[offsets[j]] += updates[j]; + size_t src_offset = j * width; + size_t dst_offset = offsets[j] * width; + for (size_t k = 0; k < width; ++k) { + size_t cur_src = src_offset + k; + size_t cur_dst = dst_offset + k; + marr[cur_dst] = b1 * marr[cur_dst] + (1 - b1) * grads[cur_src]; + varr[cur_dst] = b2 * varr[cur_dst] + + (1 - b2) * grads[cur_src] * grads[cur_src]; + param[cur_dst] = + param[cur_dst] + - lr * marr[cur_dst] / (1 - b1t) + / (sqrt(varr[cur_dst] / (1 - b2t)) + eps); + } + } + } + + void InitStates(size_t size) { + marr = new V[size](); + varr = new V[size](); + } + +private: + float lr; + float b1; + float b2; + float eps; + float b1t; + float b2t; + V *marr; + V *varr; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/server/param.h b/ps-lite/include/ps/server/param.h new file mode 100644 index 0000000..5617cac --- /dev/null +++ b/ps-lite/include/ps/server/param.h @@ -0,0 +1,140 @@ +#pragma once + +#include + +#include "common/shared_mutex.h" +#include "ps/psf/PSFunc.h" +#include "ps/server/optimizer.h" + +namespace ps { + +enum ParamType { + kParam, + kParam2D, + kCacheTable, +}; + +/* + Param with a read-write lock +*/ +template +class Param { +public: + explicit Param(size_t size, OptType otype, SArray lrs) { + vec_ = new V[size](); + size_ = size; + switch (otype) { + case SGD: + opt = new SGDOptimizer(lrs[0]); + break; + case Momentum: + opt = new MomentumOptimizer(lrs[0], lrs[1]); + break; + case NesterovMomentum: + opt = new NesterovMomentumOptimizer(lrs[0], lrs[1]); + break; + case AdaGrad: + opt = new AdaGradOptimizer(lrs[0], lrs[1], lrs[2]); + break; + case Adam: + opt = new AdamOptimizer(lrs[0], lrs[1], lrs[2], lrs[3]); + break; + case None: + opt = nullptr; + return; + } + opt->InitStates(size); + } + + ~Param() { + delete[] vec_; + } + + Param(const Param &) = delete; + + s_lock<4> read_guard() const noexcept { + return s_lock<4>(mtx); + } + x_lock<4> write_guard() noexcept { + return x_lock<4>(mtx); + } + + inline const V *data() const { + return vec_; + } + inline V *data() { + return vec_; + } + inline V *begin() { + return data(); + } + inline V *end() { + return data() + size(); + } + inline V &operator[](size_t i) { + return vec_[i]; + } + inline const V &operator[](size_t i) const { + return vec_[i]; + } + inline size_t size() const { + return size_; + } + virtual ParamType type() { + return kParam; + } + void updateDense(SArray &grads) { + auto write_lock = write_guard(); + opt->ApplyDense(*this, grads); + } + +private: + mutable shared_mutex<4> mtx; + V *vec_; + size_t size_; + +protected: + Optimizer *opt; +}; + +template +class Param2D : public Param { +public: + explicit Param2D(size_t len, size_t wid, OptType otype, SArray lrs) : + Param(len * wid, otype, lrs) { + length = len; + width = wid; + } + void updateSparse(SArray &offsets, SArray &grads) { + auto write_lock = this->write_guard(); + this->opt->ApplySparse(*this, offsets, grads); + } + ParamType type() { + return kParam2D; + } + size_t length, width; +}; + +template +class CacheTable : public Param2D { +public: + explicit CacheTable(size_t len, size_t wid, OptType otype, + SArray lrs) : + Param2D(len, wid, otype, lrs) { + ver = new version_t[len](); + } + ~CacheTable() { + delete[] ver; + } + void updateCache(SArray &updates, SArray &offsets, + SArray &grads) { + auto write_lock = this->write_guard(); + this->opt->ApplyCache(*this, updates, offsets, grads); + } + ParamType 
type() { + return kCacheTable; + } + version_t *ver; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/PSAgent.h b/ps-lite/include/ps/worker/PSAgent.h new file mode 100644 index 0000000..edc3a9b --- /dev/null +++ b/ps-lite/include/ps/worker/PSAgent.h @@ -0,0 +1,609 @@ +#pragma once + +#include "ps/ps.h" +#include "ps/worker/kvworker.h" +#include "ps/psf/PSFunc.h" +#include "ps/server/param.h" +#include "common/logging.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ps { + +struct TensorMeta { + ParamType ptype; + size_t length; + size_t width = 1; + /* split a tensor into multiple pieces. [node_name --> + * splitted_dl_array_keys] */ + vector keys; + /* [node_name --> timestamp to be waited] */ + std::vector ts; + std::vector part; +}; + +struct SparseInfos { + // store structures used in sparse operations to avoid memory leak + // if using C++17 can changed to shared_ptr (which supports dynamic arrays) + size_t *in_offset; + size_t *out_offset; + float *in_data; +}; + +/* + * A singleton object for pulling or push to PS. + * Since we enable sparse pull/push in PSVector and the length of each val is + * one, thus the $lens in @kvpairs is not useful. As a result, we use $lens to + * store the offset of each vector. for example, key=1000, lens = {1,2,3}, then + * we are accessing elements with ids as {1000+1, 1000+2, 1000+3} + */ +class PSAgent { +private: + /* The KVWorker used to make requests. */ + KVWorker _kvworker; + Partitioner *_par; + std::unordered_map _id2meta; + std::unordered_map _id2sparseinfo; + + Key _globalId = 0; + + /* for round-robin tensor placement */ + size_t _serverIndex = 0; + + PSAgent() : _kvworker(0, 0) { + _par = _kvworker.par; + } + +public: + static PSAgent *Get() { + static PSAgent e; + return &e; + } + + void wait(int name) { + for (int t : _id2meta[name].ts) + _kvworker.Wait(t); + _id2meta[name].ts.clear(); + } + + void clear(int name) { + _id2meta.erase(name); + // TODO: delete on PS + } + + void clearOnServer(int name) { + TensorMeta &meta = _id2meta[name]; + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request(meta.keys[i]); + auto cb = getCallBack(); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + wait(name); + } + + void waitTimestamp(int timestamp) { + _kvworker.Wait(timestamp); + } + + /** + * \brief init the meta information about this data on PS. + * the meta data is stored on each worker. + * \param name the name of the input data + * \param cols the #columns of the data, the data are partitioned by cols. 
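+     * \param ptype kParam for dense data, kParam2D / kCacheTable for 2-D tables
+     * \param length the length (dense) or number of rows (2-D) of the data
+     * \param width the row width for 2-D data, 1 for dense data
+     * e.g. (illustrative values): registerTensor(0, kCacheTable, 100000, 128)
+     * registers a 100000 x 128 embedding table on the servers.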
+ */ + void registerTensor(const int name, const ParamType ptype, + const size_t length, const size_t width = 1) { + assert(!_id2meta.count(name)); + TensorMeta tm; + tm.ptype = ptype; + tm.length = length; + if (ptype == kParam) { + _par->partitionDense(length, tm.keys, tm.part); + } else { + tm.width = width; + _par->partitionSparse(length, width, tm.keys, tm.part); + SparseInfos sp; + sp.in_offset = nullptr; + sp.out_offset = nullptr; + sp.in_data = nullptr; + _id2sparseinfo[name] = sp; + } + _id2meta[name] = tm; + } + + void vecPushSparse(const int name, float *dup_index, float *vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.in_data; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t num_all = index_size * width; + size_t *cp_offset = sp.in_offset = new size_t[index_size]; + float *cp_val = sp.in_data = new float[num_all](); + + size_t cur_index = 0; + size_t cur_offset = 0; + size_t cur_len = 0; + auto iter = idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + size_t st_offset = cur_offset; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + for (auto j : iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + cp_val[cur_offset + k] += vals[ori_offset + k]; + } + } + cur_offset += width; + ++iter; + } + if (cur_index > st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index), + SArray(cp_val + st_offset, cur_offset - st_offset)); + auto cb = getCallBack(); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if (t.first) + meta.ts.push_back(t.second); + return; + } + + void vecPullSparse(const int name, float *dup_index, float *vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.out_offset; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t *cp_offset = sp.out_offset = new size_t[index_size]; + + size_t cur_index = 0; + size_t cur_len = 0; + auto iter = idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + auto st_iter = iter; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + ++iter; + } + if (cur_index > st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index)); + auto cb = getCallBack( + SArray(vals, dup_index_size * width), + std::move( + std::vector>>( + st_iter, iter)), + cur_len, width); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if 
(t.first) + meta.ts.push_back(t.second); + return; + } + + void vecSDPushPull(const int name, float *dup_index, float *vals, + const size_t dup_index_size, float *out_vals, + int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.in_data; + + std::map> idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)dup_index[i]; + idx2map[idx].emplace_back(i); + } + + size_t index_size = idx2map.size(); + size_t num_all = index_size * width; + size_t *cp_offset = sp.in_offset = new size_t[index_size]; + float *cp_val = sp.in_data = new float[num_all](); + + size_t cur_index = 0; + size_t cur_offset = 0; + size_t cur_len = 0; + size_t pull_offset = 0; + auto iter = idx2map.begin(); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t st_index = cur_index; + size_t st_offset = cur_offset; + size_t local_length = lens[i] * width; + while (iter != idx2map.end() && iter->first < cur_len + lens[i]) { + cp_offset[cur_index++] = iter->first - cur_len; + for (auto j : iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + cp_val[cur_offset + k] += vals[ori_offset + k]; + } + } + cur_offset += width; + ++iter; + } + PSFData::Request request( + keys[i], + SArray(cp_offset + st_index, cur_index - st_index), + SArray(cp_val + st_offset, cur_offset - st_offset), + local_length); + auto cb = getCallBack( + SArray(out_vals + pull_offset, local_length)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += lens[i]; + pull_offset += local_length; + } + return; + } + + void vecSSPushPull(const int name, float *in_index, float *in_vals, + float *out_index, float *out_vals, + const size_t dup_index_size, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + const std::vector &keys = meta.keys; + const std::vector &lens = meta.part; + size_t width = meta.width; + SparseInfos &sp = _id2sparseinfo[name]; + delete[] sp.in_offset; + delete[] sp.out_offset; + delete[] sp.in_data; + + std::map> in_idx2map; + std::map> out_idx2map; + for (size_t i = 0; i < dup_index_size; ++i) { + size_t idx = (size_t)in_index[i]; + in_idx2map[idx].emplace_back(i); + idx = (size_t)out_index[i]; + out_idx2map[idx].emplace_back(i); + } + + size_t in_index_size = in_idx2map.size(); + size_t out_index_size = out_idx2map.size(); + size_t in_num_all = in_index_size * width; + size_t *in_cp_offset = sp.in_offset = new size_t[in_index_size]; + size_t *out_cp_offset = sp.out_offset = new size_t[out_index_size]; + float *in_cp_val = sp.in_data = new float[in_num_all](); + + size_t in_cur_index = 0; + size_t in_cur_offset = 0; + size_t cur_len = 0; + size_t out_cur_index = 0; + auto in_iter = in_idx2map.begin(); + auto out_iter = out_idx2map.begin(); + std::vector> ts(keys.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + size_t in_st_index = in_cur_index; + size_t st_offset = in_cur_offset; + while (in_iter != in_idx2map.end() + && in_iter->first < cur_len + lens[i]) { + in_cp_offset[in_cur_index++] = in_iter->first - cur_len; + for (auto j : in_iter->second) { + size_t ori_offset = j * width; + for (size_t k = 0; k < width; ++k) { + in_cp_val[in_cur_offset + k] += in_vals[ori_offset + k]; + } + } + in_cur_offset += width; + ++in_iter; + } + + size_t out_st_index = out_cur_index; + auto st_iter = out_iter; + while (out_iter != out_idx2map.end() + && out_iter->first < cur_len + 
lens[i]) { + out_cp_offset[out_cur_index++] = out_iter->first - cur_len; + ++out_iter; + } + + if (in_cur_index > in_st_index || out_cur_index > out_st_index) { + ts[i].first = true; + PSFData::Request request( + keys[i], + SArray(in_cp_offset + in_st_index, + in_cur_index - in_st_index), + SArray(in_cp_val + st_offset, + in_cur_offset - st_offset), + SArray(out_cp_offset + out_st_index, + out_cur_index - out_st_index)); + auto cb = getCallBack( + SArray(out_vals, dup_index_size * width), + std::move( + std::vector>>( + st_iter, out_iter)), + cur_len, width); + ts[i].second = _kvworker.Request(request, cb); + } else { + ts[i].first = false; + } + cur_len += lens[i]; + } + + for (auto &t : ts) + if (t.first) + meta.ts.push_back(t.second); + return; + } + + /** + * \brief PSVector: pull pairs from PS. + * \param name name of the PSVector + * \param vals the vals of pullsh vals + */ + void vecDensePush(const int name, float *vals, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + auto cb = getCallBack(); + /* send push request to each partition according to the offsets. */ + size_t cur_len = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.part[i], + SArray(vals + cur_len, meta.part[i])); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void vecDensePull(const int name, float *vals, int priority = 0) { + TensorMeta &meta = _id2meta[name]; + size_t cur_offset = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + size_t cur_length = meta.part[i] * meta.width; + PSFData::Request request(meta.keys[i], cur_length); + auto cb = getCallBack( + SArray(vals + cur_offset, cur_length)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_offset += cur_length; + } + } + + void vecDDPushPull(const int name, float *in_vals, float *out_vals, + int priority = 0) { + TensorMeta &meta = _id2meta[name]; + size_t cur_len = 0; + /* send pull request to each partition */ + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.part[i], + SArray(in_vals + cur_len, meta.part[i])); + auto cb = getCallBack( + SArray(out_vals + cur_len, meta.part[i])); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void ParameterInit(const int name, InitType init_type, double init_a, + double init_b, unsigned long long seed, OptType otype, + SArray lrs) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + PSFData::Request request( + meta.keys[i], meta.ptype, meta.part[i], meta.width, init_type, + init_a, init_b, seed, otype, lrs); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void ParameterSave(const int name, char *address) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + std::string local_address = std::string(address) + "/" + + std::to_string(name) + "_" + + std::to_string(i) + ".dat"; + SArray temp_array; + temp_array.CopyFrom(local_address.c_str(), local_address.size()); + PSFData::Request request(meta.keys[i], temp_array, + false); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void ParameterLoad(const int name, char *address) { + TensorMeta &meta = _id2meta[name]; + /* send pull request to each partition */ + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); 
i++) { + std::string local_address = std::string(address) + "/" + + std::to_string(name) + "_" + + std::to_string(i) + ".dat"; + SArray temp_array; + temp_array.CopyFrom(local_address.c_str(), local_address.size()); + PSFData::Request request(meta.keys[i], temp_array); + meta.ts.push_back(_kvworker.Request(request, cb)); + } + } + + void startRecord(std::string dirPath) { + _kvworker.startRecord(dirPath); + } + + void getLoads() { + _kvworker.recordLoads(); + } + + /* + A simple key mapping for multiple server case + */ + Key mapWkeyToSkey(Key idx) { + const std::vector &server_range = + Postoffice::Get()->GetServerKeyRanges(); + int server = idx % server_range.size(); + Key k = server_range[server].end() - idx - 1; + return k; + } + + /* + Enqueue the Zpush request for PushData + */ + void PushData(Key idx, float *vals, int len, std::vector ×tamp) { + auto cb = getCallBack(); + PSFData::Request request(mapWkeyToSkey(idx), len, + SArray(vals, len)); + int ts = _kvworker.Request(request, cb); + timestamp.push_back(ts); + } + + // This is almost the same as PushData + void PullData(Key idx, float *vals, int len, std::vector ×tamp) { + auto cb = getCallBack(SArray(vals, len)); + PSFData::Request request(mapWkeyToSkey(idx), len); + int ts = _kvworker.Request(request, cb); + timestamp.push_back(ts); + } + + void syncEmbedding(int name, const SArray &rows, + const SArray &ver, version_t bound, + PSFData::Closure closure) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + if (start == end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, ver.segment(start, end), bound); + auto cb = std::bind(closure, std::placeholders::_1, start); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void pushEmbedding(int name, const SArray &rows, + const SArray &data, + const SArray &updates) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0; + auto cb = getCallBack(); + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + if (start == end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, + data.segment(start * meta.width, end * meta.width), + updates.segment(start, end)); + meta.ts.push_back(_kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } + + void pushSyncEmbedding(int name, const SArray &rows, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_rows, + const SArray &data, + const SArray &updates) { + TensorMeta &meta = _id2meta[name]; + size_t start = 0, end = 0, cur_len = 0, push_start = 0, push_end = 0; + for (size_t i = 0; i < meta.keys.size(); i++) { + // find the idx range + start = end; + push_start = push_end; + end = std::lower_bound(rows.begin() + start, rows.end(), + cur_len + meta.part[i]) + - rows.begin(); + push_end = 
std::lower_bound(push_rows.begin() + push_start, + push_rows.end(), cur_len + meta.part[i]) + - push_rows.begin(); + if (start == end && push_start == push_end) + continue; + // remove row offset inplace so that index fits with server + SArray new_rows = rows.segment(start, end), + new_push_rows = + push_rows.segment(push_start, push_end); + for (size_t i = 0; i < new_rows.size(); i++) + new_rows[i] -= cur_len; + for (size_t i = 0; i < new_push_rows.size(); i++) + new_push_rows[i] -= cur_len; + PSFData::Request request( + meta.keys[i], new_rows, ver.segment(start, end), bound, + new_push_rows, + data.segment(push_start * meta.width, push_end * meta.width), + updates.segment(push_start, push_end)); + auto cb = std::bind(closure, std::placeholders::_1, start); + meta.ts.push_back( + _kvworker.Request(request, cb)); + cur_len += meta.part[i]; + } + } +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/callback_store.h b/ps-lite/include/ps/worker/callback_store.h new file mode 100644 index 0000000..c8fec8c --- /dev/null +++ b/ps-lite/include/ps/worker/callback_store.h @@ -0,0 +1,44 @@ +#pragma once + +#include "ps/psf/PSFunc.h" + +#include "unordered_map" +#include "mutex" + +namespace ps { + +// Used to lookup the callback for different ps functions +// Store a callback use store(timestamp, cb) +// Run a callback use run(timestamp, response) +template +class CallbackStore { +public: + using CallBack = function::Response &)>; + static CallbackStore *Get() { + static CallbackStore a; + return &a; + } + void run(int timestamp, const typename PSFData::Response &response) { + mu_.lock(); + auto it = store_.find(timestamp); + if (it != store_.end()) { + mu_.unlock(); + CHECK(it->second); + it->second(response); + mu_.lock(); + store_.erase(it); + } + mu_.unlock(); + } + void store(int ts, const CallBack &cb) { + mu_.lock(); + store_[ts] = cb; + mu_.unlock(); + } + +private: + std::unordered_map store_; + std::mutex mu_; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/hetu_binding.h b/ps-lite/include/ps/worker/hetu_binding.h new file mode 100644 index 0000000..0fd48c1 --- /dev/null +++ b/ps-lite/include/ps/worker/hetu_binding.h @@ -0,0 +1,30 @@ +#pragma once + +// Do not include worker.h or any ps-lite header here +// or we will have multiple PSAgent, PostOffice instance + +#include +#include "ps/psf/PSFunc.h" +using std::vector; + +namespace ps { + +void debug(); + +void syncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure); + +// Push Grads and Updates +// keys are unique +void PushEmbedding(int node_id, const SArray &keys, + const SArray &data, const SArray &updates); + +void PushSyncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_keys, + const SArray &data, + const SArray &updates); + +} // namespace ps diff --git a/ps-lite/include/ps/worker/kvworker.h b/ps-lite/include/ps/worker/kvworker.h new file mode 100644 index 0000000..0d7db47 --- /dev/null +++ b/ps-lite/include/ps/worker/kvworker.h @@ -0,0 +1,127 @@ +#pragma once + +#include "ps/psf/PSFunc.h" +#include "ps/psf/serializer.h" +#include "callback_store.h" +#include "ps/kvapp.h" +#include "ps/partitioner.h" +#include +#include +#include +#include +#include + +namespace ps { + +template +struct KVWorkerRegisterHelper; + +class KVWorker : private KVApp { +public: + Partitioner *par; + std::string PsfTypeString[15] = { + "DensePush", "DensePull", "DDPushPull", + "SparsePush", 
"SparsePull", "SDPushPull", + "SSPushPull", "ParamInit", "ParamClear", + "ParamSave", "ParamLoad", "kSyncEmbedding", + "kPushEmbedding", "kPushSyncEmbedding", "kNumPSfunction"}; + /** + * \brief constructor + * + * \param app_id the app id, should match with \ref KVServer's id + * \param customer_id the customer id which is unique locally + */ + explicit KVWorker(int app_id, int customer_id) : KVApp(app_id) { + KVAppRegisterHelper::init(this); + par = new AveragePartitioner(); // now use naive partitioner + } + + ~KVWorker() { + delete par; + if (logOut.is_open()) + logOut.close(); + } + + void startRecord(std::string dirPath) { + logOut.open(dirPath + "/loads_" + std::to_string(MyRank()) + ".txt"); + assert(logOut.is_open()); + } + + void recordLoads() { + for (auto iter = loads.begin(); iter != loads.end(); ++iter) { + logOut << PsfTypeString[iter->first] << ": " << (iter->second).first + << ' ' << (iter->second).second << std::endl; + } + logOut << std::endl; + loads.clear(); + } + + /** + * \brief Waits until a Request has been finished + * + * Sample usage: + * \code + * _kvworker.Wait(ts); + * \endcode + * + * \param timestamp the timestamp returned by kvworker.Request + */ + void Wait(int timestamp) { + obj_->WaitRequest(timestamp); + } + /** + * \brief make a new Request + * + * Sample usage: + * \code + * int ts = _kvworker.Request(request, callback); + * \endcode + * + * \param request create request by PSFData::Request + * \param cb the callback returned by getCallback(args...) + */ + template + int Request(const Tuple &request, const CallBack &cb) { + int timestamp = obj_->NewRequest(kServerGroup); + CallbackStore::Get()->store(timestamp, cb); + // Find the server + Key key = get<0>(request); + int target_server_id = par->queryServer(key); + // Create message + Message msg; + tupleEncode(request, msg.data); + if (logOut.is_open()) { + for (auto x : msg.data) { + loads[int(ftype)].first += x.size(); + } + } + msg.meta.app_id = obj_->app_id(); + msg.meta.customer_id = obj_->customer_id(); + msg.meta.timestamp = timestamp; + msg.meta.recver = Postoffice::Get()->ServerRankToID(target_server_id); + msg.meta.psftype = ftype; + msg.meta.request = true; + Postoffice::Get()->van()->Send(msg); + return timestamp; + } + +private: + template + void onReceive(const Message &msg) { + typename PSFData::Response response; + if (logOut.is_open()) { + for (auto x : msg.data) { + loads[int(ftype)].second += x.size(); + } + } + tupleDecode(response, msg.data); + int timestamp = msg.meta.timestamp; + CallbackStore::Get()->run(timestamp, response); + } + template + friend struct KVAppRegisterHelper; + std::unordered_map> loads; + std::ofstream logOut; +}; + +} // namespace ps diff --git a/ps-lite/include/ps/worker/worker.h b/ps-lite/include/ps/worker/worker.h new file mode 100644 index 0000000..550769e --- /dev/null +++ b/ps-lite/include/ps/worker/worker.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ps/ps.h" +#include "common/dlarray.h" +#include "common/c_runtime_api.h" +#include "ps/worker/PSAgent.h" +#include "ps/server/param.h" +#include "ps/server/optimizer.h" + +using namespace ps; + +class Worker { +public: + Worker(); + + void parameter_init(int node_name, ParamType ptype, size_t len, + size_t width, InitType init_type, double init_a, + double init_b, unsigned long long seed, OptType otype, + SArray lrs); + void parameter_save(int node_name, char *address); + void parameter_load(int node_name, char *address); + // for data 
push&pull + typedef uint64_t query_t; + /* + for each indice, call PSAgent::PushData to launch a thread + hold the return handle in the global map + immediately return + user should guaruntee value unchanged until waitdata + returns: + an query_t which is a long + use waitdata(query_t) to wait for its success + */ + query_t push_data(const long *indices, int index_size, const DLArray *value, + const long *lengths); + // this is almost the same as push_data + query_t pull_data(const long *indices, int index_size, DLArray *value, + const long *lengths); + /* + wait_data waits until a query success + */ + void wait_data(query_t query); + + void pull(int node_name, DLArray *arr); + void push(int node_name, const DLArray *arr, DLEvent *evt); + void dd_pushpull(int node_name, const DLArray *in_arr, DLArray *out_arr, + DLEvent *evt); + void sparse_pull(int node_name, const DLArray *index, DLArray *value, + size_t index_size); + void sparse_push(int node_name, const DLArray *index, const DLArray *value, + size_t index_size, DLEvent *evt); + void sd_pushpull(int node_name, const DLArray *index, const DLArray *in_arr, + size_t index_size, DLArray *out_arr, DLEvent *evt); + void ss_pushpull(int node_name, const DLArray *inind, const DLArray *in_arr, + const DLArray *outind, DLArray *out_arr, size_t index_size, + DLEvent *evt); + void wait(int node_name); + void clear(int node_name); + void clear_on_server(int node_name); + +private: + // used this hold to thread_pool return object + std::unordered_map> query2timestamp; + // data_pull & data_push query, increase 1 each call + query_t next_query = 0; + // protect query2timestamp and next_query + std::mutex data_mu; + + // for concurrent parameter push&pull + std::unordered_map> node2pullthread; + std::unordered_map> node2pushthread; + + int _thread_num = 3; +}; + +extern Worker worker; diff --git a/ps-lite/proto/meta.proto b/ps-lite/proto/meta.proto new file mode 100644 index 0000000..49f54de --- /dev/null +++ b/ps-lite/proto/meta.proto @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2015 by Contributors + */ +syntax = "proto2"; +package ps; +option optimize_for = LITE_RUNTIME; + +message PBNode { + // the node role + required int32 role = 1; + // node id + optional int32 id = 2; + // hostname or ip + optional string hostname = 3; + // the port this node is binding + optional int32 port = 4; + // whether this node is created by failover + optional bool is_recovery = 5; + // the locally unique id of an customer + optional int32 customer_id = 10; +} + +// system control info +message PBControl { + required int32 cmd = 1; + repeated PBNode node = 2; + optional int32 barrier_group = 3; + optional uint64 msg_sig = 4; +} + +// mete information about a message +message PBMeta { + // if set, then it is system control task. 
otherwise, it is for app + optional PBControl control = 1; + // true: a request task + // false: the response task to the request task with the same *time* + optional bool request = 2 [default = false]; + // the unique id of an application + optional int32 app_id = 3; + // the locally unique id of an customer + optional int32 customer_id = 4; + // the timestamp of this message + optional int32 timestamp = 5; + // priority + optional int32 priority = 6 [default = 0]; + // psftype + required int32 psftype = 7 [default = 0]; +} diff --git a/ps-lite/src/PSFhandle_embedding.cc b/ps-lite/src/PSFhandle_embedding.cc new file mode 100644 index 0000000..a35e7bc --- /dev/null +++ b/ps-lite/src/PSFhandle_embedding.cc @@ -0,0 +1,81 @@ +#include "ps/server/PSFHandle.h" + +namespace ps { + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto rows = get<1>(request); + auto data = get<2>(request); + auto updates = get<3>(request); + auto iter = const_store.find(k); + CHECK_NE(iter, const_store.end()) << "key does not exist"; + CHECK_EQ(iter->second->type(), kCacheTable) << " key is not Cachetable"; + auto &value_set = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set.width; + CHECK_EQ(updates.size(), rows.size()) + << "PushEmbedding updates size mismatch"; + CHECK_EQ(data.size(), rows.size() * width) + << "PushEmbedding data size mismatch"; + auto write_lock = value_set.write_guard(); + for (size_t i = 0; i < rows.size(); i++) { + value_set.ver[rows[i]] += updates[i]; + for (size_t j = 0; j < width; j++) + value_set[rows[i] * width + j] += data[i * width + j]; + } +} + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + Key k = get<0>(request); + auto rows = get<1>(request); + auto ver = get<2>(request); + auto bound = get<3>(request); + auto &idx = get<0>(response); + auto &ret_ver = get<1>(response); + auto &data = get<2>(response); + auto iter = const_store.find(k); + CHECK_NE(iter, const_store.end()) << "key does not exist"; + CHECK_EQ(iter->second->type(), kCacheTable) << " key is not Cachetable"; + auto &value_set = + *std::dynamic_pointer_cast>(iter->second); + size_t width = value_set.width; + auto read_lock = value_set.read_guard(); + size_t count = 0; + for (size_t i = 0; i < rows.size(); i++) + if (ver[i] == -1 || value_set.ver[rows[i]] - ver[i] > bound) + count++; + idx.resize(count); + ret_ver.resize(count); + data.resize(count * width); + count = 0; + for (size_t i = 0; i < rows.size(); i++) { + if (ver[i] == -1 || value_set.ver[rows[i]] - ver[i] > bound) { + idx[count] = i; + ret_ver[count] = value_set.ver[rows[i]]; + std::copy(&value_set[rows[i] * width], + &value_set[(rows[i] + 1) * width], &data[count * width]); + count++; + } + } +} + +void KVServerMatrixHandle::serve( + const PSFData::Request &request, + PSFData::Response &response) { + PSFData::Request push_req( + std::get<0>(request), std::get<4>(request), std::get<5>(request), + std::get<6>(request)); + PSFData::Response push_res; + serve(push_req, push_res); + + PSFData::Request sync_req( + std::get<0>(request), std::get<1>(request), std::get<2>(request), + std::get<3>(request)); + serve(sync_req, response); +} + +} // namespace ps diff --git a/ps-lite/src/customer.cc b/ps-lite/src/customer.cc new file mode 100644 index 0000000..6922ecb --- /dev/null +++ b/ps-lite/src/customer.cc @@ -0,0 +1,67 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#include 
"ps/internal/customer.h" +#include "ps/internal/postoffice.h" +namespace ps { + +const int Node::kEmpty = std::numeric_limits::max(); +const int Meta::kEmpty = std::numeric_limits::max(); + +Customer::Customer(int app_id, int customer_id, + const Customer::RecvHandle &recv_handle) : + app_id_(app_id), + customer_id_(customer_id), recv_handle_(recv_handle) { + cur_timestamp = 0; + Postoffice::Get()->AddCustomer(this); + // for debug set num_threads = 5; + int num_threads = 5; + for (int i = 0; i < num_threads; i++) { + recv_threads_.emplace_back(new std::thread(&Customer::Receiving, this)); + } +} + +Customer::~Customer() { + Postoffice::Get()->RemoveCustomer(this); + Message msg; + msg.meta.control.cmd = Control::TERMINATE; + msg.meta.control.barrier_group = 0; + msg.meta.control.msg_sig = 0; + recv_queue_.Push(msg); + for (auto &thread : recv_threads_) + thread->join(); +} + +int Customer::NewRequest(int recver) { + std::lock_guard lk(tracker_mu_); + assert(recver == kServerGroup); + tracker_[cur_timestamp] = false; + return cur_timestamp++; +} + +void Customer::WaitRequest(int timestamp) { + std::unique_lock lk(tracker_mu_); + tracker_cond_.wait(lk, [this, timestamp] { return tracker_[timestamp]; }); + tracker_.erase(timestamp); +} + +void Customer::Receiving() { + while (true) { + Message recv; + // thread safe + recv_queue_.WaitAndPop(&recv); + if (!recv.meta.control.empty() + && recv.meta.control.cmd == Control::TERMINATE) { + recv_queue_.Push(recv); + break; + } + recv_handle_(recv); + if (!recv.meta.request) { + std::lock_guard lk(tracker_mu_); + tracker_[recv.meta.timestamp] = true; + tracker_cond_.notify_all(); + } + } +} + +} // namespace ps diff --git a/ps-lite/src/hetu_binding.cc b/ps-lite/src/hetu_binding.cc new file mode 100644 index 0000000..cd95031 --- /dev/null +++ b/ps-lite/src/hetu_binding.cc @@ -0,0 +1,35 @@ +#include "ps/worker/worker.h" +#include "ps/worker/hetu_binding.h" + +namespace ps { + +void syncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure) { + PSAgent::Get()->syncEmbedding(node_id, keys, ver, bound, closure); + PSAgent::Get()->wait(node_id); +} + +void PushEmbedding(int node_id, const SArray &keys, + const SArray &data, + const SArray &updates) { + PSAgent::Get()->pushEmbedding(node_id, keys, data, updates); + PSAgent::Get()->wait(node_id); +} + +void PushSyncEmbedding(int node_id, const SArray &keys, + const SArray &ver, version_t bound, + PSFData::Closure closure, + const SArray &push_keys, + const SArray &data, + const SArray &updates) { + PSAgent::Get()->pushSyncEmbedding(node_id, keys, ver, bound, closure, + push_keys, data, updates); + PSAgent::Get()->wait(node_id); +} + +void debug() { + printf("hetu at %p\n", Postoffice::Get()); +} + +} // namespace ps diff --git a/ps-lite/src/ibverbs_van.h b/ps-lite/src/ibverbs_van.h new file mode 100644 index 0000000..e1f48b3 --- /dev/null +++ b/ps-lite/src/ibverbs_van.h @@ -0,0 +1,1195 @@ +#ifndef PS_IBVERBS_VAN_H_ +#define PS_IBVERBS_VAN_H_ + +#ifdef DMLC_USE_IBVERBS + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ps/internal/threadsafe_queue.h" +#include "ps/internal/van.h" + +namespace ps { + +// Number of context buffers for sending START messages +static const int kStartDepth = 128; + +// Number of context buffers for writing messages +static const int kWriteDepth = kStartDepth; + 
+// Number of context buffers for receiving messages +static const int kRxDepth = kStartDepth * 2; + +// Number of context buffers for sending REPLY messages +static const int kReplyDepth = kRxDepth; + +// Maximum number of scatter/gather elements in any Work Request +static const int kSGEntry = 4; + +// Time to wait for resolution to complete (in milliseconds) +static const int kTimeoutms = 1000; + +// Number of backlog of incoming connection requests +static const int kRdmaListenBacklog = 128; + +// Number of preallocated work request buffers +static const int kMaxConcurrentWorkRequest = + kRxDepth + kStartDepth + kReplyDepth + kWriteDepth; + +// Length of buffers for storing hostname in the context of a connection request +static const int kMaxHostnameLength = 16; + +// Maximum number of ``data'' in a Message +// TODO(changlan): What if there are more data in Message? +static const int kMaxDataFields = 4; + +// Alignment in Mempool +static const size_t kAlignment = 8; + +template +static inline T align_floor(T v, T align) { + return v - (v % align); +} + +template +static inline T align_ceil(T v, T align) { + return align_floor(v + align - 1, align); +} + +// A simple thread-safe memory pool for RDMA memory regions +class SimpleMempool { +public: + // Allocated an initial ``size'' of registered memory regions + explicit SimpleMempool(struct ibv_pd *pd, size_t size = 0x1000000) { + pd_ = pd; + struct ibv_mr *mr; + char *p = reinterpret_cast(aligned_alloc(kAlignment, size)); + total_allocated_size += size; + CHECK(p); + CHECK(mr = + ibv_reg_mr(pd, p, size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE)); + // this mr is associated with memory address range [p, p+size] + mr_list.emplace(p + size, mr); + free_list.emplace(size, p); + } + + // Deregister and release all memory regions + ~SimpleMempool() { + std::lock_guard lk(mu_); + for (auto it = mr_list.begin(); it != mr_list.end(); it++) { + CHECK_EQ(ibv_dereg_mr(it->second), 0); + free(it->second->addr); + } + } + + // Take a buffer of ``size'' from the pool. If there is not enough remaining + // space in existing memory regions, allocate and register a new memory + // region. 
+ char *Alloc(size_t size) { + if (size == 0) { + return nullptr; + } + + std::lock_guard lk(mu_); + + // Make sure the memory addresses are aligned by rounding the size up to + // next power of two + size_t proper_size = align_ceil(size, kAlignment); + + // Find a buffer of size greater than or equal to proper_size + auto it = free_list.lower_bound(proper_size); + + if (it + == free_list.end()) { // if there is no space left, need to allocate + // and register new memory + size_t new_mem_size = total_allocated_size; + while (proper_size > new_mem_size) { + new_mem_size *= 2; + } + char *p = reinterpret_cast( + aligned_alloc(kAlignment, new_mem_size)); + CHECK(p); + struct ibv_mr *mr; + CHECK(mr = ibv_reg_mr(pd_, p, new_mem_size, + IBV_ACCESS_LOCAL_WRITE + | IBV_ACCESS_REMOTE_WRITE)); + mr_list.emplace(p + new_mem_size, mr); + free_list.emplace(new_mem_size, p); + it = free_list.lower_bound(proper_size); + PS_VLOG(1) << "Not enough memory in the pool, requested size " + << proper_size << ", new allocated size " + << new_mem_size; + total_allocated_size += new_mem_size; + } + + CHECK_NE(free_list.end(), it) << "Not enough memory"; + CHECK_GE(it->first, proper_size); + + char *addr = it->second; + size_t space_left = it->first - proper_size; + + free_list.erase(it); + CHECK_EQ(used_list.find(addr), used_list.end()) + << "Address is already allocated"; + + used_list.emplace(addr, proper_size); + + if (space_left) { + free_list.emplace(space_left, addr + proper_size); + } + + return addr; + } + + // Return the buffer pointed by ``addr'' into the pool + void Free(char *addr) { + if (!addr) { + return; + } + + std::lock_guard lk(mu_); + + auto it = used_list.find(addr); + CHECK_NE(used_list.end(), it) + << "Cannot find info about address: " << (uintptr_t)addr; + + size_t size = it->second; + used_list.erase(it); + free_list.emplace(size, addr); + } + + uint32_t LocalKey(char *addr) { + struct ibv_mr *mr = Addr2MR(addr); + return mr->lkey; + } + + uint32_t RemoteKey(char *addr) { + struct ibv_mr *mr = Addr2MR(addr); + return mr->rkey; + } + +private: + std::mutex mu_; // for thread safety + struct ibv_pd *pd_; + + // buffer size -> buffer pointer + std::multimap free_list; + // buffer pointer -> buffer size + std::unordered_map used_list; + // first: `end` of this mr address (e.g., for mr with [addr, addr+size), + // point to `addr+size`) + std::map mr_list; + + size_t total_allocated_size = 0; + + // Convert the memory address to its associated RDMA memory region + inline struct ibv_mr *Addr2MR(char *addr) { + auto it = mr_list.lower_bound(addr); + CHECK_NE(it, mr_list.end()) + << "cannot find the associated memory region"; + return it->second; + } +}; + +class Block { +public: + explicit Block(SimpleMempool *pool, char *addr, int count) : + pool(pool), addr(addr), counter(count) { + } + + ~Block() { + CHECK_EQ(counter, 0); + pool->Free(addr); + } + + void Release() { + int v = counter.fetch_sub(1); + if (v == 1) { + delete this; + } + } + +private: + SimpleMempool *pool; + char *addr; + std::atomic counter; +}; + +enum MessageTypes : uint32_t { + kRendezvousStart, + kRendezvousReply, +}; + +struct RendezvousStart { + uint64_t meta_len; + uint64_t data_num; + uint64_t data_len[kMaxDataFields]; + uint64_t origin_addr; +}; + +struct RendezvousReply { + uint64_t addr; + uint64_t origin_addr; + uint32_t rkey; + uint32_t idx; +}; + +enum WRContextType { + kRendezvousStartContext, + kRendezvousReplyContext, + kWriteContext, + kReceiveContext +}; + +struct WRContext { + WRContextType type; + struct 
ibv_mr *buffer; + void *private_data; +}; + +struct BufferContext { + char *buffer; + size_t meta_len; + size_t data_num; + size_t data_len[kMaxDataFields]; +}; + +struct LocalBufferContext { + size_t meta_len; + char *meta_buf; + std::vector> data; +}; + +struct MessageBuffer { + size_t inline_len; + char *inline_buf; + WRContext *reserved_context; + std::vector> data; + std::vector> mrs; +}; + +struct RequestContext { + uint32_t node; + uint16_t port; + char hostname[kMaxHostnameLength]; +}; + +static_assert(std::is_pod::value, + "RendezvousStart must be a POD type."); +static_assert(std::is_pod::value, + "RendezvousReply must be a POD type."); +static_assert(std::is_pod::value, + "RequestContext must be a POD type."); + +static const size_t kMempoolChunkSize = + std::max(sizeof(RendezvousStart), sizeof(RendezvousReply)); + +template +class AddressPool { +public: + AddressPool() { + std::lock_guard lk(mu_); + for (int i = 0; i < kMaxEntries; i++) { + indices_.push(i); + table_[i] = nullptr; + } + } + + T *GetAddressAndRelease(uint32_t index) { + std::lock_guard lk(mu_); + T *ptr = table_[index]; + CHECK(ptr); + indices_.push(index); + table_[index] = nullptr; + return ptr; + } + + uint32_t StoreAddress(T *ptr) { + std::lock_guard lk(mu_); + CHECK(ptr); + uint32_t idx = indices_.front(); + indices_.pop(); + CHECK_EQ(table_[idx], nullptr); + table_[idx] = ptr; + return idx; + } + +private: + static const int kMaxEntries = 512; + + std::mutex mu_; + std::queue indices_; + T *table_[kMaxEntries]; +}; + +struct Endpoint { + enum ConnectionStatus { IDLE, CONNECTING, CONNECTED, REJECTED }; + + ConnectionStatus status; + int node_id; + std::condition_variable cv; + std::mutex connect_mu; + struct rdma_cm_id *cm_id; + + WRContext rx_ctx[kRxDepth]; + + WRContext start_ctx[kStartDepth]; + WRContext reply_ctx[kReplyDepth]; + WRContext write_ctx[kWriteDepth]; + + ThreadsafeQueue free_start_ctx; + ThreadsafeQueue free_reply_ctx; + ThreadsafeQueue free_write_ctx; + + Endpoint() : status(IDLE), node_id(Node::kEmpty), cm_id(nullptr), rx_ctx() { + } + + ~Endpoint() { + for (int i = 0; i < kRxDepth; ++i) { + if (!(rx_ctx[i].buffer)) { + continue; + } + free(rx_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(rx_ctx[i].buffer), 0); + } + + for (int i = 0; i < kStartDepth; ++i) { + if (start_ctx[i].buffer) { + free(start_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(start_ctx[i].buffer), 0); + } + } + + for (int i = 0; i < kReplyDepth; ++i) { + if (reply_ctx[i].buffer) { + free(reply_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(reply_ctx[i].buffer), 0); + } + } + + for (int i = 0; i < kWriteDepth; ++i) { + if (write_ctx[i].buffer) { + free(write_ctx[i].buffer->addr); + CHECK_EQ(ibv_dereg_mr(write_ctx[i].buffer), 0); + } + } + + rdma_destroy_qp(cm_id); + CHECK_EQ(rdma_destroy_id(cm_id), 0) << strerror(errno); + } + + void Disconnect() { + std::unique_lock lk(connect_mu); + CHECK_EQ(rdma_disconnect(cm_id), 0) << strerror(errno); + cv.wait(lk, [this] { return status == IDLE; }); + } + + void SetNodeID(int id) { + node_id = id; + } + + void InitSendContextHelper(struct ibv_pd *pd, WRContext *ctx, + ThreadsafeQueue *queue, size_t num, + WRContextType type) { + for (size_t i = 0; i < num; ++i) { + void *buf = aligned_alloc(kAlignment, kMempoolChunkSize); + CHECK(buf); + struct ibv_mr *mr = ibv_reg_mr(pd, buf, kMempoolChunkSize, 0); + CHECK(mr); + + ctx[i].type = type; + ctx[i].buffer = mr; + ctx[i].private_data = this; + queue->Push(&ctx[i]); + } + } + + void Init(struct ibv_cq *cq, struct ibv_pd *pd) { + struct 
ibv_qp_init_attr attr; + memset(&attr, 0, sizeof(ibv_qp_init_attr)); + attr.send_cq = cq; + attr.recv_cq = cq; + attr.cap.max_send_wr = kStartDepth + kReplyDepth + kWriteDepth; + attr.cap.max_recv_wr = kRxDepth; + attr.cap.max_send_sge = kSGEntry; + attr.cap.max_recv_sge = kSGEntry; + attr.qp_type = IBV_QPT_RC; + attr.sq_sig_all = 0; + + CHECK_EQ(rdma_create_qp(cm_id, pd, &attr), 0) + << "Create RDMA queue pair failed"; + + InitSendContextHelper(pd, start_ctx, &free_start_ctx, kStartDepth, + kRendezvousStartContext); + InitSendContextHelper(pd, reply_ctx, &free_reply_ctx, kReplyDepth, + kRendezvousReplyContext); + InitSendContextHelper(pd, write_ctx, &free_write_ctx, kWriteDepth, + kWriteContext); + + for (size_t i = 0; i < kRxDepth; ++i) { + void *buf = aligned_alloc(kAlignment, kMempoolChunkSize); + CHECK(buf); + struct ibv_mr *mr = + ibv_reg_mr(pd, buf, kMempoolChunkSize, IBV_ACCESS_LOCAL_WRITE); + CHECK(mr); + + rx_ctx[i].type = kReceiveContext; + rx_ctx[i].buffer = mr; + rx_ctx[i].private_data = this; + + PostRecv(&rx_ctx[i]); + } + } + + void PostRecv(WRContext *ctx) { + struct ibv_recv_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(ctx->buffer->addr); + sge.length = kMempoolChunkSize; + sge.lkey = ctx->buffer->lkey; + + wr.wr_id = reinterpret_cast(ctx); + wr.next = nullptr; + wr.sg_list = &sge; + wr.num_sge = 1; + + CHECK_EQ(ibv_post_recv(cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_recv failed."; + } +}; + +class IBVerbsVan : public Van { +public: + IBVerbsVan() { + } + ~IBVerbsVan() { + } + +protected: + void Start(int customer_id) override { + start_mu_.lock(); + should_stop_ = false; + + if (event_channel_ == nullptr) { + event_channel_ = rdma_create_event_channel(); + CHECK(event_channel_) << "Create RDMA event channel failed"; + + cm_event_polling_thread_.reset( + new std::thread(&IBVerbsVan::PollEvents, this)); + } + + start_mu_.unlock(); + Van::Start(customer_id); + } + + void Stop() override { + PS_VLOG(1) << my_node_.ShortDebugString() << " is stopping"; + Van::Stop(); + + should_stop_ = true; + CHECK(should_stop_); + + PS_VLOG(1) << "Stopping cq_polling_thread_."; + cq_polling_thread_->join(); + cq_polling_thread_.reset(); + + PS_VLOG(1) << "Stopping cm_event_polling_thread_."; + cm_event_polling_thread_->join(); + cm_event_polling_thread_.reset(); + + PS_VLOG(1) << "Clearing mempool."; + mempool_.reset(); + + for (auto &it : allocated_mr_) { + ibv_dereg_mr(it.second); + } + + PS_VLOG(1) << "Clearing endpoints."; + incoming_.clear(); + endpoints_.clear(); + + PS_VLOG(1) << "Destroying cq and pd."; + CHECK(!ibv_destroy_cq(cq_)) << "Failed to destroy CQ"; + CHECK(!ibv_destroy_comp_channel(comp_event_channel_)) + << "Failed to destroy channel"; + + // TODO(changlan): ibv_dealloc_pd sometimes complains about busy + // resources + + PS_VLOG(1) << "Destroying listener."; + rdma_destroy_id(listener_); + rdma_destroy_event_channel(event_channel_); + } + + int Bind(const Node &node, int max_retry) override { + CHECK(rdma_create_id(event_channel_, &listener_, nullptr, RDMA_PS_TCP) + == 0) + << "Create RDMA connection identifier failed"; + + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + int port = node.port; + unsigned seed = static_cast(time(NULL) + port); + for (int i = 0; i < max_retry + 1; ++i) { + addr.sin_port = htons(port); + if (rdma_bind_addr(listener_, + reinterpret_cast(&addr)) + == 0) { + break; + } + if (i == max_retry) { + port = -1; + } else { + port = 10000 + 
rand_r(&seed) % 40000; + } + } + CHECK(rdma_listen(listener_, kRdmaListenBacklog) == 0) + << "Listen RDMA connection failed: " << strerror(errno); + return port; + } + + void Connect(const Node &node) override { + PS_VLOG(1) << "Connecting to " << my_node_.ShortDebugString(); + CHECK_NE(node.id, node.kEmpty); + CHECK_NE(node.port, node.kEmpty); + CHECK(node.hostname.size()); + + // worker doesn't need to connect to the other workers. same for server + if ((node.role == my_node_.role) && (node.id != my_node_.id)) { + return; + } + + std::string node_host_ip = + node.hostname + ":" + std::to_string(node.port); + if (node.id != Node::kEmpty) { + auto it = endpoints_.find(node.id); + + // if there is an endpoint with pending connection + if (it != endpoints_.end()) { + endpoints_.erase(it); + } + + Endpoint *endpoint; + endpoints_[node.id] = std::unique_ptr(new Endpoint()); + endpoint = endpoints_[node.id].get(); + + endpoint->SetNodeID(node.id); + + struct addrinfo *remote_addr; + CHECK_EQ(getaddrinfo(node.hostname.c_str(), + std::to_string(node.port).c_str(), nullptr, + &remote_addr), + 0); + + while (endpoint->status != Endpoint::CONNECTED) { + std::unique_lock lk(endpoint->connect_mu); + endpoint->status = Endpoint::CONNECTING; + + if (endpoint->cm_id != nullptr) { + rdma_destroy_qp(endpoint->cm_id); + CHECK_EQ(rdma_destroy_id(endpoint->cm_id), 0) + << strerror(errno); + endpoint->cm_id = nullptr; + } + + CHECK_EQ(rdma_create_id(event_channel_, &endpoint->cm_id, + nullptr, RDMA_PS_TCP), + 0) + << "Create RDMA connection identifier failed"; + endpoint->cm_id->context = endpoint; + + CHECK_EQ(rdma_resolve_addr(endpoint->cm_id, nullptr, + remote_addr->ai_addr, kTimeoutms), + 0) + << "Resolve RDMA address failed with errno: " << errno; + + endpoint->cv.wait(lk, [endpoint] { + return endpoint->status != Endpoint::CONNECTING; + }); + + if (endpoint->status == Endpoint::CONNECTED) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + freeaddrinfo(remote_addr); + } + } + + int SendMsg(const Message &msg) override { + int remote_id = msg.meta.recver; + CHECK_NE(remote_id, Meta::kEmpty); + + PBMeta meta; + PackMetaPB(msg.meta, &meta); + + CHECK_NE(endpoints_.find(remote_id), endpoints_.end()); + Endpoint *endpoint = endpoints_[remote_id].get(); + MessageBuffer *msg_buf = new MessageBuffer(); + + size_t meta_len = meta.ByteSize(); + size_t data_len = msg.meta.data_size; + size_t total_len = meta_len + data_len; + + CHECK(meta_len); + + // For control messages, inline the message content + // into the START message. + // Otherwise, register the data buffer as RDMA memory + // region. + if (!msg.meta.control.empty()) { // control message + msg_buf->inline_len = total_len; + msg_buf->inline_buf = mempool_->Alloc(total_len); + meta.SerializeToArray(msg_buf->inline_buf, meta_len); + char *cur = msg_buf->inline_buf + meta_len; + for (auto &sa : msg.data) { + size_t seg_len = sa.size(); + memcpy(cur, sa.data(), seg_len); + cur += seg_len; + } + } else { // data message + msg_buf->inline_len = meta_len; + msg_buf->inline_buf = mempool_->Alloc(meta_len); + msg_buf->data = msg.data; + meta.SerializeToArray(msg_buf->inline_buf, meta_len); + + for (auto &sa : msg_buf->data) { + if (sa.size() == 0) { + continue; + } + // Optimization: If the memory region has been registered, + // (assuming the previously registered address is not freed) + // re-use the same memory region. 
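+                // Registration via ibv_reg_mr pins the pages and is relatively
+                // expensive, so the lookup below caches one memory region per
+                // buffer address and reuses it across messages. Note that the
+                // cached region keeps the size it was first registered with,
+                // so this relies on the buffer neither being freed nor growing
+                // at the same address afterwards.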
+ char *p = sa.data(); + auto it = allocated_mr_.find(p); + if (it == allocated_mr_.end()) { + allocated_mr_[p] = ibv_reg_mr(pd_, p, sa.size(), 0); + } + CHECK(allocated_mr_[p]) << "Invalid memory region"; + msg_buf->mrs.push_back({allocated_mr_[p], sa.size()}); + } + } + + // Take the second context buffer first to avoid deadlock + WRContext *context = nullptr, *reserved = nullptr; + endpoint->free_write_ctx.WaitAndPop(&reserved); + endpoint->free_start_ctx.WaitAndPop(&context); + + msg_buf->reserved_context = reserved; + + RendezvousStart *req = + reinterpret_cast(context->buffer->addr); + req->meta_len = meta_len; + + for (size_t i = 0; i < msg.data.size(); ++i) { + req->data_len[i] = msg.data[i].size(); + } + req->data_num = msg.data.size(); + req->origin_addr = reinterpret_cast(msg_buf); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(req); + sge.length = sizeof(RendezvousStart); + sge.lkey = context->buffer->lkey; + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(context); + wr.opcode = IBV_WR_SEND_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = kRendezvousStart; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + CHECK_EQ(ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << strerror(errno); + + return total_len; + } + + int RecvMsg(Message *msg) override { + msg->data.clear(); + std::tuple notification; + recv_buffers_.WaitAndPop(¬ification); + + Endpoint *endpoint = std::get<0>(notification); + BufferContext *buffer_ctx = std::get<1>(notification); + + int total_len = 0; + + msg->meta.recver = my_node_.id; + msg->meta.sender = endpoint->node_id; + + char *cur = buffer_ctx->buffer; + + UnpackMeta(cur, buffer_ctx->meta_len, &msg->meta); + total_len += buffer_ctx->meta_len; + uint64_t data_num = buffer_ctx->data_num; + cur += buffer_ctx->meta_len; + + if (data_num > 0) { + Block *mem_block = + new Block(mempool_.get(), buffer_ctx->buffer, data_num); + + for (size_t i = 0; i < data_num; i++) { + uint32_t len = buffer_ctx->data_len[i]; + SArray data; + data.reset(cur, len, [mem_block](void *) { + mem_block->Release(); + }); // Defer the deletion of block_ref + msg->data.push_back(data); + cur += len; + total_len += len; + } + } else { + mempool_->Free(buffer_ctx->buffer); + } + + delete buffer_ctx; + return total_len; + } + +private: + void InitContext(struct ibv_context *context) { + context_ = context; + CHECK(context_) << "ibv_context* empty"; + + pd_ = ibv_alloc_pd(context_); + CHECK(pd_) << "Failed to allocate protection domain"; + + mempool_.reset(new SimpleMempool(pd_)); + + comp_event_channel_ = ibv_create_comp_channel(context_); + + // TODO(clan): Replace the rough estimate here + cq_ = ibv_create_cq(context_, kMaxConcurrentWorkRequest * 2, NULL, + comp_event_channel_, 0); + + CHECK(cq_) << "Failed to create completion queue"; + CHECK(!ibv_req_notify_cq(cq_, 0)) + << "Failed to request CQ notification"; + } + + void ReleaseWorkRequestContext(WRContext *context, Endpoint *endpoint) { + switch (context->type) { + case kRendezvousStartContext: + endpoint->free_start_ctx.Push(context); + break; + case kRendezvousReplyContext: + endpoint->free_reply_ctx.Push(context); + break; + case kWriteContext: + endpoint->free_write_ctx.Push(context); + break; + case kReceiveContext: + endpoint->PostRecv(context); + break; + default: + CHECK(0); + } + } + + void PollCQ() { + // Pre-allocated work completions array used for polling + struct ibv_wc wc[kMaxConcurrentWorkRequest]; + while 
(!should_stop_.load()) { + int ne = ibv_poll_cq(cq_, kMaxConcurrentWorkRequest, wc); + CHECK_GE(ne, 0); + for (int i = 0; i < ne; ++i) { + CHECK(wc[i].status == IBV_WC_SUCCESS) + << "Failed status \n" + << ibv_wc_status_str(wc[i].status) << " " << wc[i].status + << " " << static_cast(wc[i].wr_id) << " " + << wc[i].vendor_err; + + WRContext *context = reinterpret_cast(wc[i].wr_id); + Endpoint *endpoint = + reinterpret_cast(context->private_data); + + CHECK(endpoint); + + switch (wc[i].opcode) { + case IBV_WC_SEND: + // LOG(INFO) << "opcode: IBV_WC_SEND"; + ReleaseWorkRequestContext(context, endpoint); + break; + case IBV_WC_RDMA_WRITE: { + // LOG(INFO) << "opcode: IBV_WC_RDMA_WRITE"; + // Note: This is not a struct ibv_mr* + MessageBuffer *msg_buf = + *reinterpret_cast( + context->buffer->addr); + mempool_->Free(msg_buf->inline_buf); + delete msg_buf; + ReleaseWorkRequestContext(context, endpoint); + } break; + case IBV_WC_RECV_RDMA_WITH_IMM: { + // LOG(INFO) << "opcode: IBV_WC_RECV_RDMA_WITH_IMM"; + uint32_t addr_idx = wc[i].imm_data; + BufferContext *buf_ctx = + addr_pool_.GetAddressAndRelease(addr_idx); + recv_buffers_.Push(std::make_tuple(endpoint, buf_ctx)); + ReleaseWorkRequestContext(context, endpoint); + } break; + case IBV_WC_RECV: { + CHECK(wc[i].wc_flags & IBV_WC_WITH_IMM); + uint32_t imm = wc[i].imm_data; + struct ibv_mr *mr = context->buffer; + + if (imm == kRendezvousStart) { + // LOG(INFO) << "opcode: IBV_WC_RECV kRendezvousStart"; + RendezvousStart *req = + reinterpret_cast(mr->addr); + BufferContext *buf_ctx = new BufferContext(); + + uint64_t len = req->meta_len; + buf_ctx->meta_len = len; + buf_ctx->data_num = req->data_num; + for (size_t i = 0; i < req->data_num; ++i) { + buf_ctx->data_len[i] = req->data_len[i]; + len += req->data_len[i]; + } + + char *buffer = mempool_->Alloc(len); + CHECK(buffer) << "Alloc for " << len + << " bytes, data_num: " << req->data_num; + + buf_ctx->buffer = buffer; + + uint64_t origin_addr = req->origin_addr; + + WRContext *reply_ctx = nullptr; + endpoint->free_reply_ctx.WaitAndPop(&reply_ctx); + RendezvousReply *resp = + reinterpret_cast( + reply_ctx->buffer->addr); + + resp->addr = reinterpret_cast(buffer); + resp->rkey = mempool_->RemoteKey(buffer); + resp->origin_addr = origin_addr; + resp->idx = addr_pool_.StoreAddress(buf_ctx); + + struct ibv_sge sge; + sge.addr = reinterpret_cast(resp); + sge.length = sizeof(RendezvousReply); + sge.lkey = reply_ctx->buffer->lkey; + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(reply_ctx); + wr.opcode = IBV_WR_SEND_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = kRendezvousReply; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + + CHECK_EQ( + ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_send failed."; + + } else if (imm == kRendezvousReply) { + // LOG(INFO) << "opcode: IBV_WC_RECV kRendezvousReply"; + RendezvousReply *resp = + reinterpret_cast(mr->addr); + uint64_t remote_addr = resp->addr; + uint64_t origin_addr = resp->origin_addr; + uint32_t rkey = resp->rkey; + uint32_t idx = resp->idx; + + MessageBuffer *msg_buf = + reinterpret_cast(origin_addr); + + struct ibv_sge sge[1 + msg_buf->mrs.size()]; + + sge[0].addr = + reinterpret_cast(msg_buf->inline_buf); + sge[0].length = msg_buf->inline_len; + sge[0].lkey = mempool_->LocalKey(msg_buf->inline_buf); + + size_t num_sge = 1; + for (auto &pair : msg_buf->mrs) { + size_t length = pair.second; + CHECK(length); + sge[num_sge].addr = + 
reinterpret_cast(pair.first->addr); + sge[num_sge].length = length; + sge[num_sge].lkey = pair.first->lkey; + ++num_sge; + } + + WRContext *write_ctx = msg_buf->reserved_context; + + MessageBuffer **tmp = + reinterpret_cast( + write_ctx->buffer->addr); + *tmp = msg_buf; // write the addr of msg_buf into the mr + // buffer + + struct ibv_send_wr wr, *bad_wr = nullptr; + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast(write_ctx); + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.next = nullptr; + + wr.imm_data = idx; + + wr.send_flags = IBV_SEND_SIGNALED; + wr.sg_list = sge; + wr.num_sge = num_sge; + + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + CHECK_EQ( + ibv_post_send(endpoint->cm_id->qp, &wr, &bad_wr), 0) + << "ibv_post_send failed."; + + } else { + CHECK(0); + } + ReleaseWorkRequestContext(context, endpoint); + } break; + default: + CHECK(0) << "Unexpected opcode: " << wc[i].opcode; + } + } + } + } + + void PollEvents() { + int flags = fcntl(event_channel_->fd, F_GETFL); + int rc = fcntl(event_channel_->fd, F_SETFL, flags | O_NONBLOCK); + CHECK_GE(rc, 0); + int error_flags = POLLERR | POLLHUP | POLLNVAL; + + while (!should_stop_.load()) { + struct pollfd pfd = { + .fd = event_channel_->fd, .events = POLLIN, .revents = 0}; + int ret = poll(&pfd, 1, 10); + + CHECK_GE(ret, 0) << strerror(errno); + CHECK_EQ(pfd.revents & error_flags, 0); + + if (!(pfd.revents & POLLIN)) { + continue; + } + + struct rdma_cm_event *event; + CHECK_EQ(rdma_get_cm_event(event_channel_, &event), 0); + // TODO(clan): Reorder the list according to the event frequency + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + OnConnectRequest(event); + break; + case RDMA_CM_EVENT_ADDR_RESOLVED: + OnAddrResolved(event); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + OnRouteResolved(event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + OnConnected(event); + break; + case RDMA_CM_EVENT_DISCONNECTED: + OnDisconnected(event); + break; + case RDMA_CM_EVENT_REJECTED: + OnRejected(event); + break; + default: + CHECK(0) << "OnEvent: unknown event " << event->event << " (" + << rdma_event_str(event->event) << ")"; + } + rdma_ack_cm_event(event); + } + } + + void OnRejected(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + + auto it = endpoints_.find(endpoint->node_id); + CHECK(it != endpoints_.end()) << "Connection not ready."; + CHECK_EQ(endpoint->status, Endpoint::CONNECTING); + CHECK_EQ(endpoint->cm_id, id); + + PS_VLOG(1) << "Connection rejected, retrying..."; + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::REJECTED; + } + endpoint->cv.notify_all(); + } + + void OnConnectRequest(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK_NOTNULL(id); + + CHECK_LE(sizeof(RequestContext), event->param.conn.private_data_len) + << "RequestContext size mismatch. 
Actual: " + << (size_t)event->param.conn.private_data_len + << ", Expected: " << sizeof(RequestContext); + CHECK_NOTNULL(event->param.conn.private_data); + + const RequestContext *remote_ctx = + reinterpret_cast( + event->param.conn.private_data); + + const auto r = + incoming_.emplace(std::unique_ptr(new Endpoint())); + Endpoint *endpoint = r.first->get(); + endpoint->SetNodeID(remote_ctx->node); + endpoint->cm_id = id; + id->context = endpoint; + + if (context_ == nullptr) { + InitContext(id->verbs); + } + + endpoint->Init(cq_, pd_); + + RequestContext ctx; + ctx.node = static_cast(my_node_.id); + ctx.port = static_cast(my_node_.port); + snprintf(ctx.hostname, kMaxHostnameLength, "%s", + my_node_.hostname.c_str()); + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + cm_params.retry_count = 7; + cm_params.rnr_retry_count = 7; + cm_params.private_data = &ctx; + cm_params.private_data_len = sizeof(RequestContext); + + CHECK_EQ(rdma_accept(id, &cm_params), 0) + << "Accept RDMA connection failed: " << strerror(errno); + } + + // Resolve a route after address is resolved + void OnAddrResolved(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK_EQ(rdma_resolve_route(id, kTimeoutms), 0) + << "Resolve RDMA route failed"; + } + + // Make a connection after route is resolved + void OnRouteResolved(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + + if (context_ == nullptr) { + InitContext(id->verbs); + } + + endpoint->Init(cq_, pd_); + + RequestContext ctx; + ctx.node = static_cast(my_node_.id); + ctx.port = static_cast(my_node_.port); + snprintf(ctx.hostname, kMaxHostnameLength, "%s", + my_node_.hostname.c_str()); + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + cm_params.retry_count = 7; + cm_params.rnr_retry_count = 7; + cm_params.private_data = &ctx; + cm_params.private_data_len = sizeof(RequestContext); + + CHECK_EQ(rdma_connect(id, &cm_params), 0) + << "RDMA connect failed" << strerror(errno); + } + + void OnConnected(struct rdma_cm_event *event) { + struct rdma_cm_id *id = event->id; + CHECK(id) << "rdma_cm_id not found."; + Endpoint *endpoint = reinterpret_cast(id->context); + CHECK(endpoint) << "Endpoint not found."; + + if (cq_polling_thread_ == nullptr) { + cq_polling_thread_.reset( + new std::thread(&IBVerbsVan::PollCQ, this)); + } + + CHECK_EQ(endpoint->cm_id, id); + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::CONNECTED; + } + endpoint->cv.notify_all(); + } + + void OnDisconnected(struct rdma_cm_event *event) { + LOG(INFO) << "OnDisconnected from Node " << my_node_.id; + struct rdma_cm_id *id = event->id; + Endpoint *endpoint = reinterpret_cast(id->context); + { + std::lock_guard lk(endpoint->connect_mu); + endpoint->status = Endpoint::IDLE; + } + endpoint->cv.notify_all(); + } + + AddressPool addr_pool_; + std::unique_ptr mempool_; + + struct rdma_cm_id *listener_ = nullptr; + std::atomic should_stop_; + + std::unordered_map> endpoints_; + std::unordered_set> incoming_; + + struct rdma_event_channel *event_channel_ = nullptr; + struct ibv_context *context_ = nullptr; + + std::unordered_map allocated_mr_; + + // ibverbs protection domain + struct ibv_pd *pd_ = nullptr; + // Completion event channel, to wait for work completions + struct ibv_comp_channel *comp_event_channel_ = nullptr; + // Completion queue, to poll on work completions + struct ibv_cq *cq_ = nullptr; + // cq thread + 
std::unique_ptr cq_polling_thread_; + // event thread + std::unique_ptr cm_event_polling_thread_; + // Recv buffer queue + ThreadsafeQueue> recv_buffers_; +}; // namespace ps +}; // namespace ps + +#endif // DMLC_USE_IBVERBS +#endif // PS_IBVERBS_VAN_H_ diff --git a/ps-lite/src/network_utils.h b/ps-lite/src/network_utils.h new file mode 100644 index 0000000..953380e --- /dev/null +++ b/ps-lite/src/network_utils.h @@ -0,0 +1,277 @@ +/** + * Copyright (c) 2015 by Contributors + * @file network_utils.h + * @brief network utilities + */ +#ifndef PS_NETWORK_UTILS_H_ +#define PS_NETWORK_UTILS_H_ +#include +#ifdef _MSC_VER +#include +#include +#include +#include +#undef interface +#else +#include +#include +#include +#include +#endif +#include + +namespace ps { + +/** + * \brief return the IP address for given interface eth0, eth1, ... + */ +void GetIP(const std::string &interface, std::string *ip) { +#ifdef _MSC_VER + typedef std::basic_string tstring; + // Try to get the Adapters-info table, so we can given useful names to the + // IP addresses we are returning. Gotta call GetAdaptersInfo() up to 5 + // times to handle the potential race condition between the size-query call + // and the get-data call. I love a well-designed API :^P + IP_ADAPTER_INFO *pAdapterInfo = NULL; + { + ULONG bufLen = 0; + for (int i = 0; i < 5; i++) { + DWORD apRet = GetAdaptersInfo(pAdapterInfo, &bufLen); + if (apRet == ERROR_BUFFER_OVERFLOW) { + free(pAdapterInfo); // in case we had previously allocated it + pAdapterInfo = static_cast(malloc(bufLen)); + } else if (apRet == ERROR_SUCCESS) { + break; + } else { + free(pAdapterInfo); + pAdapterInfo = NULL; + break; + } + } + } + if (pAdapterInfo) { + tstring keybase = + _T( + "SYSTEM\\CurrentControlSet\\Control\\Network\\{4D36E972-E325-11CE-BFC1-08002BE10318}\\"); + tstring connection = _T("\\Connection"); + + IP_ADAPTER_INFO *curpAdapterInfo = pAdapterInfo; + while (curpAdapterInfo) { + HKEY hKEY; + std::string AdapterName = curpAdapterInfo->AdapterName; + // GUID only ascii + tstring key_set = keybase + + tstring(AdapterName.begin(), AdapterName.end()) + + connection; + LPCTSTR data_Set = key_set.c_str(); + LPCTSTR dwValue = NULL; + if (ERROR_SUCCESS + == ::RegOpenKeyEx(HKEY_LOCAL_MACHINE, data_Set, 0, KEY_READ, + &hKEY)) { + DWORD dwSize = 0; + DWORD dwType = REG_SZ; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + dwValue = new TCHAR[dwSize]; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + // interface name must only ascii + tstring tstr = dwValue; + std::string s(tstr.begin(), tstr.end()); + if (s == interface) { + *ip = + curpAdapterInfo->IpAddressList.IpAddress.String; + break; + } + } + } + ::RegCloseKey(hKEY); + } + curpAdapterInfo = curpAdapterInfo->Next; + } + free(pAdapterInfo); + } +#else + struct ifaddrs *ifAddrStruct = NULL; + struct ifaddrs *ifa = NULL; + void *tmpAddrPtr = NULL; + + getifaddrs(&ifAddrStruct); + for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family == AF_INET) { + // is a valid IP4 Address + tmpAddrPtr = + &(reinterpret_cast(ifa->ifa_addr)) + ->sin_addr; + char addressBuffer[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, tmpAddrPtr, addressBuffer, INET_ADDRSTRLEN); + if (strncmp(ifa->ifa_name, interface.c_str(), interface.size()) + == 0) { + *ip = addressBuffer; + break; + } + } + } + if (ifAddrStruct != NULL) + freeifaddrs(ifAddrStruct); +#endif +} + 
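A minimal caller of `GetIP` might look like the sketch below (an illustrative standalone example, not part of this patch; the interface name `"eth0"` and the include path are assumptions):

```cpp
// Illustrative usage of ps::GetIP (assumed include path; "eth0" is only an
// example interface name). GetIP leaves *ip unchanged if no IPv4 address is
// bound to the requested interface.
#include <iostream>
#include <string>
#include "network_utils.h"

int main() {
    std::string ip;
    ps::GetIP("eth0", &ip);
    if (ip.empty()) {
        std::cerr << "no IPv4 address bound to eth0" << std::endl;
        return 1;
    }
    std::cout << "eth0 -> " << ip << std::endl;
    return 0;
}
```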
+/** + * \brief return the IP address and Interface the first interface which is not + * loopback + * + * only support IPv4 + */ +void GetAvailableInterfaceAndIP(std::string *interface, std::string *ip) { +#ifdef _MSC_VER + typedef std::basic_string tstring; + IP_ADAPTER_INFO *pAdapterInfo = NULL; + { + ULONG bufLen = 0; + for (int i = 0; i < 5; i++) { + DWORD apRet = GetAdaptersInfo(pAdapterInfo, &bufLen); + if (apRet == ERROR_BUFFER_OVERFLOW) { + free(pAdapterInfo); // in case we had previously allocated it + pAdapterInfo = static_cast(malloc(bufLen)); + } else if (apRet == ERROR_SUCCESS) { + break; + } else { + free(pAdapterInfo); + pAdapterInfo = NULL; + break; + } + } + } + if (pAdapterInfo) { + tstring keybase = + _T( + "SYSTEM\\CurrentControlSet\\Control\\Network\\{4D36E972-E325-11CE-BFC1-08002BE10318}\\"); + tstring connection = _T("\\Connection"); + + IP_ADAPTER_INFO *curpAdapterInfo = pAdapterInfo; + HKEY hKEY = NULL; + while (curpAdapterInfo) { + std::string curip(curpAdapterInfo->IpAddressList.IpAddress.String); + if (curip == "127.0.0.1") { + curpAdapterInfo = curpAdapterInfo->Next; + continue; + } + if (curip == "0.0.0.0") { + curpAdapterInfo = curpAdapterInfo->Next; + continue; + } + + std::string AdapterName = curpAdapterInfo->AdapterName; + // GUID only ascii + tstring key_set = keybase + + tstring(AdapterName.begin(), AdapterName.end()) + + connection; + LPCTSTR data_Set = key_set.c_str(); + LPCTSTR dwValue = NULL; + if (ERROR_SUCCESS + == ::RegOpenKeyEx(HKEY_LOCAL_MACHINE, data_Set, 0, KEY_READ, + &hKEY)) { + DWORD dwSize = 0; + DWORD dwType = REG_SZ; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + dwValue = new TCHAR[dwSize]; + if (ERROR_SUCCESS + == ::RegQueryValueEx(hKEY, _T("Name"), 0, &dwType, + (LPBYTE)dwValue, &dwSize)) { + // interface name must only ascii + tstring tstr = dwValue; + std::string s(tstr.begin(), tstr.end()); + + *interface = s; + *ip = curip; + break; + } + } + ::RegCloseKey(hKEY); + hKEY = NULL; + } + curpAdapterInfo = curpAdapterInfo->Next; + } + if (hKEY != NULL) { + ::RegCloseKey(hKEY); + } + free(pAdapterInfo); + } +#else + struct ifaddrs *ifAddrStruct = nullptr; + struct ifaddrs *ifa = nullptr; + + interface->clear(); + ip->clear(); + getifaddrs(&ifAddrStruct); + for (ifa = ifAddrStruct; ifa != nullptr; ifa = ifa->ifa_next) { + if (nullptr == ifa->ifa_addr) + continue; + + if (AF_INET == ifa->ifa_addr->sa_family + && 0 == (ifa->ifa_flags & IFF_LOOPBACK)) { + char address_buffer[INET_ADDRSTRLEN]; + void *sin_addr_ptr = + &(reinterpret_cast(ifa->ifa_addr)) + ->sin_addr; + inet_ntop(AF_INET, sin_addr_ptr, address_buffer, INET_ADDRSTRLEN); + + *ip = address_buffer; + *interface = ifa->ifa_name; + + break; + } + } + if (nullptr != ifAddrStruct) + freeifaddrs(ifAddrStruct); + return; +#endif +} + +/** + * \brief return an available port on local machine + * + * only support IPv4 + * \return 0 on failure + */ +int GetAvailablePort() { + struct sockaddr_in addr; + addr.sin_port = + htons(0); // have system pick up a random port available for me + addr.sin_family = AF_INET; // IPV4 + addr.sin_addr.s_addr = htonl(INADDR_ANY); // set our addr to any interface + + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (0 != bind(sock, (struct sockaddr *)&addr, sizeof(struct sockaddr_in))) { + perror("bind():"); + return 0; + } +#ifdef _MSC_VER + int addr_len = sizeof(struct sockaddr_in); +#else + socklen_t addr_len = sizeof(struct sockaddr_in); +#endif + + if (0 != getsockname(sock, (struct sockaddr 
*)&addr, &addr_len)) { + perror("getsockname():"); + return 0; + } + + int ret_port = ntohs(addr.sin_port); +#ifdef _MSC_VER + closesocket(sock); +#else + close(sock); +#endif + return ret_port; +} + +} // namespace ps +#endif // PS_NETWORK_UTILS_H_ diff --git a/ps-lite/src/p3_van.h b/ps-lite/src/p3_van.h new file mode 100644 index 0000000..fb2e880 --- /dev/null +++ b/ps-lite/src/p3_van.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_P3_VAN_H_ +#define PS_P3_VAN_H_ +#include +namespace ps { + +/** + * \brief P3 based Van implementation + */ +class P3Van : public ZMQVan { +public: + P3Van() { + } + virtual ~P3Van() { + } + +protected: + void Start(int customer_id) override { + start_mu_.lock(); + if (init_stage == 0) { + // start sender + // set num_threads= 5 for debug + int num_threads = 5; + for (int i = 0; i < num_threads; i++) + sender_threads_.emplace_back( + new std::thread(&P3Van::Sending, this)); + // sender_thread_ = std::unique_ptr( + // new std::thread(&P3Van::Sending, this)); + init_stage++; + } + start_mu_.unlock(); + ZMQVan::Start(customer_id); + } + + void Stop() override { + ZMQVan::Stop(); + for (auto &thread : sender_threads_) + thread->join(); + } + + int SendMsg(const Message &msg) override { + send_queue_.Push(msg); + return 0; + } + + void Sending() { + while (true) { + Message msg; + send_queue_.WaitAndPop(&msg); + ZMQVan::SendMsg(msg); + if (!msg.meta.control.empty() + && msg.meta.control.cmd == Control::TERMINATE) { + // debug for stop + send_queue_.Push(msg); + break; + } + } + } + +private: + /** the thread for sending messages */ + // using multithread + std::vector> sender_threads_; + ThreadsafePQueue send_queue_; + int init_stage = 0; +}; +} // namespace ps + +#endif // PS_P3_VAN_H_ diff --git a/ps-lite/src/postoffice.cc b/ps-lite/src/postoffice.cc new file mode 100644 index 0000000..0d91ab7 --- /dev/null +++ b/ps-lite/src/postoffice.cc @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#include +#include +#include +#include "ps/internal/postoffice.h" +#include "ps/internal/message.h" +#include "ps/base.h" +#include "common/logging.h" + +namespace ps { +Postoffice::Postoffice() { + env_ref_ = Environment::_GetSharedRef(); +} + +void Postoffice::InitEnvironment() { + const char *val = NULL; + std::string van_type = GetEnv("DMLC_PS_VAN_TYPE", "zmq"); + van_ = Van::Create(van_type); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_NUM_WORKER")); + num_workers_ = atoi(val); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_NUM_SERVER")); + num_servers_ = atoi(val); + val = CHECK_NOTNULL(Environment::Get()->find("DMLC_ROLE")); + std::string role(val); + is_worker_ = role == "worker"; + is_server_ = role == "server"; + is_scheduler_ = role == "scheduler"; + verbose_ = GetEnv("PS_VERBOSE", 0); +} + +void Postoffice::Start(int customer_id, const char *argv0, + const bool do_barrier) { + start_mu_.lock(); + if (init_stage_ == 0) { + InitEnvironment(); + // init glog + if (argv0) { + dmlc::InitLogging(argv0); + } else { + dmlc::InitLogging("ps-lite\0"); + } + + // init node info. 
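+        // The loops below expand every single node id and every composite
+        // group id (sums of the scheduler / server / worker group constants)
+        // into the full list of member node ids, so that a message addressed
+        // to a group can later be fanned out to each member via node_ids_.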
+ for (int i = 0; i < num_workers_; ++i) { + int id = WorkerRankToID(i); + for (int g : {id, kWorkerGroup, kWorkerGroup + kServerGroup, + kWorkerGroup + kScheduler, + kWorkerGroup + kServerGroup + kScheduler}) { + node_ids_[g].push_back(id); + } + } + + for (int i = 0; i < num_servers_; ++i) { + int id = ServerRankToID(i); + for (int g : {id, kServerGroup, kWorkerGroup + kServerGroup, + kServerGroup + kScheduler, + kWorkerGroup + kServerGroup + kScheduler}) { + node_ids_[g].push_back(id); + } + } + + for (int g : {kScheduler, kScheduler + kServerGroup + kWorkerGroup, + kScheduler + kWorkerGroup, kScheduler + kServerGroup}) { + node_ids_[g].push_back(kScheduler); + } + init_stage_++; + } + start_mu_.unlock(); + + // start van + van_->Start(customer_id); + + start_mu_.lock(); + if (init_stage_ == 1) { + // record start time + start_time_ = time(NULL); + init_stage_++; + } + start_mu_.unlock(); + // do a barrier here + if (do_barrier) + Barrier(customer_id, kWorkerGroup + kServerGroup + kScheduler); +} + +void Postoffice::Finalize(const int customer_id, const bool do_barrier) { + if (init_stage_ == 0) + return; + if (do_barrier) + Barrier(customer_id, kWorkerGroup + kServerGroup + kScheduler); + if (customer_id == 0) { + num_workers_ = 0; + num_servers_ = 0; + van_->Stop(); + init_stage_ = 0; + customers_.clear(); + node_ids_.clear(); + barrier_done_.clear(); + server_key_ranges_.clear(); + heartbeats_.clear(); + if (exit_callback_) + exit_callback_(); + } +} + +void Postoffice::AddCustomer(Customer *customer) { + std::lock_guard lk(mu_); + int app_id = CHECK_NOTNULL(customer)->app_id(); + // check if the customer id has existed + int customer_id = CHECK_NOTNULL(customer)->customer_id(); + CHECK_EQ(customers_[app_id].count(customer_id), (size_t)0) + << "customer_id " << customer_id << " already exists\n"; + customers_[app_id].insert(std::make_pair(customer_id, customer)); + std::unique_lock ulk(barrier_mu_); + barrier_done_[app_id].insert(std::make_pair(customer_id, false)); +} + +void Postoffice::RemoveCustomer(Customer *customer) { + std::lock_guard lk(mu_); + int app_id = CHECK_NOTNULL(customer)->app_id(); + int customer_id = CHECK_NOTNULL(customer)->customer_id(); + customers_[app_id].erase(customer_id); + if (customers_[app_id].empty()) { + customers_.erase(app_id); + } +} + +Customer *Postoffice::GetCustomer(int app_id, int customer_id, + int timeout) const { + Customer *obj = nullptr; + for (int i = 0; i < timeout * 1000 + 1; ++i) { + { + std::lock_guard lk(mu_); + const auto it = customers_.find(app_id); + if (it != customers_.end()) { + std::unordered_map customers_in_app = + it->second; + obj = customers_in_app[customer_id]; + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + return obj; +} + +void Postoffice::Barrier(int customer_id, int node_group) { + if (GetNodeIDs(node_group).size() <= 1) + return; + auto role = van_->my_node().role; + if (role == Node::SCHEDULER) { + CHECK(node_group & kScheduler); + } else if (role == Node::WORKER) { + CHECK(node_group & kWorkerGroup); + } else if (role == Node::SERVER) { + CHECK(node_group & kServerGroup); + } + + std::unique_lock ulk(barrier_mu_); + barrier_done_[0][customer_id] = false; + Message req; + req.meta.recver = kScheduler; + req.meta.request = true; + req.meta.control.cmd = Control::BARRIER; + req.meta.app_id = 0; + req.meta.customer_id = customer_id; + req.meta.control.barrier_group = node_group; + req.meta.timestamp = van_->GetTimestamp(); + van_->Send(req); + barrier_cond_.wait( + ulk, [this, 
customer_id] { return barrier_done_[0][customer_id]; }); +} + +const std::vector &Postoffice::GetServerKeyRanges() { + server_key_ranges_mu_.lock(); + if (server_key_ranges_.empty()) { + for (int i = 0; i < num_servers_; ++i) { + server_key_ranges_.push_back(Range( + kMaxKey / num_servers_ * i, kMaxKey / num_servers_ * (i + 1))); + } + } + server_key_ranges_mu_.unlock(); + return server_key_ranges_; +} + +void Postoffice::Manage(const Message &recv) { + CHECK(!recv.meta.control.empty()); + const auto &ctrl = recv.meta.control; + if (ctrl.cmd == Control::BARRIER && !recv.meta.request) { + barrier_mu_.lock(); + auto size = barrier_done_[recv.meta.app_id].size(); + for (size_t customer_id = 0; customer_id < size; customer_id++) { + barrier_done_[recv.meta.app_id][customer_id] = true; + } + barrier_mu_.unlock(); + barrier_cond_.notify_all(); + } +} + +std::vector Postoffice::GetDeadNodes(int t) { + std::vector dead_nodes; + if (!van_->IsReady() || t == 0) + return dead_nodes; + + time_t curr_time = time(NULL); + const auto &nodes = is_scheduler_ ? + GetNodeIDs(kWorkerGroup + kServerGroup) : + GetNodeIDs(kScheduler); + { + std::lock_guard lk(heartbeat_mu_); + for (int r : nodes) { + auto it = heartbeats_.find(r); + if ((it == heartbeats_.end() || it->second + t < curr_time) + && start_time_ + t < curr_time) { + dead_nodes.push_back(r); + } + } + } + return dead_nodes; +} +} // namespace ps diff --git a/ps-lite/src/python_binding.cc b/ps-lite/src/python_binding.cc new file mode 100644 index 0000000..e4f5b6d --- /dev/null +++ b/ps-lite/src/python_binding.cc @@ -0,0 +1,140 @@ +#include "ps/worker/worker.h" + +#include "ps/ps.h" +#include "ps/server/kvserver.h" + +extern "C" { + +void Init() { + if (Postoffice::Get()->van()) + return; + Start(0); +} + +void Finalize() { + Finalize(0, true); +} + +void Pull(int node_name, DLArray *arr) { + worker.pull(node_name, arr); +} + +void Push(int node_name, const DLArray *arr, DLEvent *evt) { + worker.push(node_name, arr, evt); +} + +void DDPushPull(int node_name, const DLArray *in_arr, DLArray *out_arr, + DLEvent *evt) { + worker.dd_pushpull(node_name, in_arr, out_arr, evt); +} + +void SparsePush(int node_name, const DLArray *index, const DLArray *value, + DLEvent *evt) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sparse_push(node_name, index, value, index_size, evt); +} + +void SparsePull(int node_name, const DLArray *index, DLArray *value) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sparse_pull(node_name, index, value, index_size); +} + +void SDPushPull(int node_name, const DLArray *index, const DLArray *in_arr, + DLArray *out_arr, DLEvent *evt) { + size_t index_size = 1; + for (int i = 0; i < index->ndim; i++) + index_size *= index->shape[i]; + worker.sd_pushpull(node_name, index, in_arr, index_size, out_arr, evt); +} + +void SSPushPull(int node_name, const DLArray *inindices, const DLArray *in_arr, + const DLArray *outindices, DLArray *out_arr, DLEvent *evt) { + size_t index_size = 1; + assert(inindices->ndim == outindices->ndim); + for (int i = 0; i < inindices->ndim; ++i) { + assert(inindices->shape[i] == outindices->shape[i]); + index_size *= inindices->shape[i]; + } + worker.ss_pushpull(node_name, inindices, in_arr, outindices, out_arr, + index_size, evt); +} + +/** + * args: + * index, example index + * value, the example value + * length, length of every example + */ +Worker::query_t PushData(const long *index, int 
index_size, + const DLArray *value, const long *length) { + return worker.push_data(index, index_size, value, length); +} + +Worker::query_t PullData(const long *index, int index_size, DLArray *value, + const long *length) { + return worker.pull_data(index, index_size, value, length); +} + +void Wait(int node_id) { + worker.wait(node_id); +} + +void WaitData(Worker::query_t query) { + worker.wait_data(query); +} + +void BarrierWorker() { + Postoffice::Get()->Barrier(0, kWorkerGroup); +} + +void InitTensor(int node_name, int ptype, int len, int width, int init_type, + double init_a, double init_b, unsigned long long seed, + int otype, float lrs[], int nlr) { + worker.parameter_init( + node_name, static_cast(ptype), static_cast(len), + static_cast(width), static_cast(init_type), init_a, + init_b, seed, static_cast(otype), SArray(lrs, nlr)); +} + +void Clear(int node_name) { + worker.clear(node_name); +} + +void ClearOnServer(int node_name) { + worker.clear_on_server(node_name); +} + +void SaveParam(int node_name, char *address) { + worker.parameter_save(node_name, address); +} + +void LoadParam(int node_name, char *address) { + worker.parameter_load(node_name, address); +} + +void startRecord(char *dirPath) { + PSAgent::Get()->startRecord(std::string(dirPath)); +} + +void getLoads() { + PSAgent::Get()->getLoads(); +} + +void StartServer() { + auto server = new KVServer(0); + RegisterExitCallback([server]() { delete server; }); +} + +int rank() { + return Postoffice::Get()->my_rank(); +} + +int nrank() { + return Postoffice::Get()->num_workers(); +} + +} // extern "C" diff --git a/ps-lite/src/resender.h b/ps-lite/src/resender.h new file mode 100644 index 0000000..e96bcbc --- /dev/null +++ b/ps-lite/src/resender.h @@ -0,0 +1,150 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef PS_RESENDER_H_ +#define PS_RESENDER_H_ +#include +#include +#include +#include +namespace ps { + +/** + * \brief resend a messsage if no ack is received within a given time + */ +class Resender { +public: + /** + * \param timeout timeout in millisecond + */ + Resender(int timeout, int max_num_retry, Van *van) { + timeout_ = timeout; + max_num_retry_ = max_num_retry; + van_ = van; + monitor_ = new std::thread(&Resender::Monitoring, this); + } + ~Resender() { + exit_ = true; + monitor_->join(); + delete monitor_; + } + + /** + * \brief add an outgoining message + * + */ + void AddOutgoing(const Message &msg) { + if (msg.meta.control.cmd == Control::ACK) + return; + CHECK_NE(msg.meta.timestamp, Meta::kEmpty) << msg.DebugString(); + auto key = GetKey(msg); + std::lock_guard lk(mu_); + // already buffered, which often due to call Send by the monitor thread + if (send_buff_.find(key) != send_buff_.end()) + return; + + auto &ent = send_buff_[key]; + ent.msg = msg; + ent.send = Now(); + ent.num_retry = 0; + } + + /** + * \brief add an incomming message + * \brief return true if msg has been added before or a ACK message + */ + bool AddIncomming(const Message &msg) { + // a message can be received by multiple times + if (msg.meta.control.cmd == Control::TERMINATE) { + return false; + } else if (msg.meta.control.cmd == Control::ACK) { + mu_.lock(); + auto key = msg.meta.control.msg_sig; + auto it = send_buff_.find(key); + if (it != send_buff_.end()) + send_buff_.erase(it); + mu_.unlock(); + return true; + } else { + mu_.lock(); + auto key = GetKey(msg); + auto it = acked_.find(key); + bool duplicated = it != acked_.end(); + if (!duplicated) + acked_.insert(key); + mu_.unlock(); + // send back ack message (even if it is 
duplicated)
+            Message ack;
+            ack.meta.recver = msg.meta.sender;
+            ack.meta.sender = msg.meta.recver;
+            ack.meta.control.cmd = Control::ACK;
+            ack.meta.control.msg_sig = key;
+            van_->Send(ack);
+            // warning
+            if (duplicated)
+                LOG(WARNING) << "Duplicated message: " << msg.DebugString();
+            return duplicated;
+        }
+    }
+
+private:
+    using Time = std::chrono::milliseconds;
+    // the buffer entry
+    struct Entry {
+        Message msg;
+        Time send;
+        int num_retry = 0;
+    };
+    std::unordered_map<uint64_t, Entry> send_buff_;
+
+    uint64_t GetKey(const Message &msg) {
+        CHECK_NE(msg.meta.timestamp, Meta::kEmpty) << msg.DebugString();
+        uint16_t id = msg.meta.app_id;
+        uint8_t sender = msg.meta.sender == Node::kEmpty ? van_->my_node().id :
+                                                           msg.meta.sender;
+        uint8_t recver = msg.meta.recver;
+        return (static_cast<uint64_t>(id) << 48)
+               | (static_cast<uint64_t>(sender) << 40)
+               | (static_cast<uint64_t>(recver) << 32)
+               | (msg.meta.timestamp << 1) | msg.meta.request;
+    }
+    Time Now() {
+        return std::chrono::duration_cast