
add documentation of the basic framework

1. add the design document of the basic framework.
2. add the roadmap.
3. add the proposal of joint inference.

Signed-off-by: Jie Pu <pujie2@huawei.com>
tags/v0.1.0
Jie Pu llhuii 4 years ago
parent commit caaee68204
29 changed files with 1620 additions and 0 deletions
  1. docs/Makefile (+34, -0)
  2. docs/_static/css/custom.css (+3, -0)
  3. docs/conf.py (+79, -0)
  4. docs/examples/joint_inference_example_link.rst (+1, -0)
  5. docs/index.rst (+57, -0)
  6. docs/lib-api/modules.rst (+7, -0)
  7. docs/lib-api/neptune.hard_example_mining.image_classification.rst (+16, -0)
  8. docs/lib-api/neptune.hard_example_mining.object_detection.rst (+18, -0)
  9. docs/lib-api/neptune.hard_example_mining.rst (+27, -0)
  10. docs/lib-api/neptune.joint_inference.rst (+26, -0)
  11. docs/lib-api/neptune.rst (+18, -0)
  12. docs/make.bat (+35, -0)
  13. docs/proposals/architecture.md (+58, -0)
  14. docs/proposals/dataset-and-model.md (+350, -0)
  15. docs/proposals/images/dataset-creation-flow.png (BIN)
  16. docs/proposals/images/dataset-model-crd.png (BIN)
  17. docs/proposals/images/framework-zh.png (BIN)
  18. docs/proposals/images/framework.png (BIN)
  19. docs/proposals/images/joint-inference-controller.png (BIN)
  20. docs/proposals/images/joint-inference-downstream-controller.png (BIN)
  21. docs/proposals/images/joint-inference-flow-creation.png (BIN)
  22. docs/proposals/images/joint-inference-service-crd-details.png (BIN)
  23. docs/proposals/images/joint-inference-service-crd.png (BIN)
  24. docs/proposals/images/joint-inference-upstream-controller.png (BIN)
  25. docs/proposals/images/joint-inference-worker-communication.png (BIN)
  26. docs/proposals/joint-inference.md (+553, -0)
  27. docs/quickstart.md (+23, -0)
  28. docs/roadmap.md (+21, -0)
  29. docs/setup/install.md (+294, -0)

docs/Makefile (+34, -0)

@@ -0,0 +1,34 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = build

SPHINXAPIDOC = sphinx-apidoc

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

clean:
	rm -rf "$(BUILDDIR)"

api:
	rm -rf ./source/api/*
	@$(SPHINXAPIDOC) -M -o ./lib-api/joint_inference ../lib/neptune/joint_inference

html:
	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

all: clean api html

docs/_static/css/custom.css (+3, -0)

@@ -0,0 +1,3 @@
.wy-nav-content {
    max-width: 100%;
}

docs/conf.py (+79, -0)

@@ -0,0 +1,79 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import sphinx_rtd_theme

sys.path.insert(0, os.path.abspath('../lib'))
sys.path.insert(0, os.path.abspath('../lib/neptune'))

# -- Project information -----------------------------------------------------

project = 'Neptune'
copyright = '2020, Kubeedge'
author = 'Kubeedge'

# -- General configuration ---------------------------------------------------

from recommonmark.parser import CommonMarkParser

source_parsers = {
    '.md': CommonMarkParser,
}
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['m2r2', 'sphinx.ext.autodoc', 'sphinx_markdown_tables', ]

# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The master toctree document
master_doc = 'index'

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

html_static_path = ['_static']

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

html_theme_options = {
    'prev_next_buttons_location': 'both'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

source_suffix = {
    '.rst': 'restructuredtext',
    '.txt': 'markdown',
    '.md': 'markdown',
}


def setup(app):
    app.add_stylesheet('css/custom.css')

docs/examples/joint_inference_example_link.rst (+1, -0)

@@ -0,0 +1 @@
.. mdinclude:: ../../examples/helmet_detection_inference/README.md

docs/index.rst (+57, -0)

@@ -0,0 +1,57 @@
===========================================
Neptune documentation
===========================================



.. toctree::
   :maxdepth: 1
   :caption: QUICK START
   :hidden:

   quickstart


.. toctree::
   :maxdepth: 1
   :caption: INTRODUCTION
   :hidden:

   proposals/architecture
   proposals/dataset-and-model
   proposals/joint-inference


.. toctree::
   :maxdepth: 1
   :caption: Setup

   setup/install


.. toctree::
   :maxdepth: 1
   :caption: EXAMPLES
   :hidden:

   examples/joint_inference_example_link


.. toctree::
   :maxdepth: 2
   :caption: API
   :hidden:

   lib-api/modules






Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

docs/lib-api/modules.rst (+7, -0)

@@ -0,0 +1,7 @@
neptune
=======

.. toctree::
   :maxdepth: 4

   neptune

docs/lib-api/neptune.hard_example_mining.image_classification.rst (+16, -0)

@@ -0,0 +1,16 @@
neptune.hard\_example\_mining.image\_classification package
===========================================================

.. automodule:: neptune.hard_example_mining.image_classification
   :members:
   :undoc-members:
   :show-inheritance:


neptune.hard\_example\_mining.image\_classification.hard\_mine\_filters module
------------------------------------------------------------------------------

.. automodule:: neptune.hard_example_mining.image_classification.hard_mine_filters
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.hard_example_mining.object_detection.rst (+18, -0)

@@ -0,0 +1,18 @@
neptune.hard\_example\_mining.object\_detection package
=======================================================

.. automodule:: neptune.hard_example_mining.object_detection
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

neptune.hard\_example\_mining.object\_detection.scores\_filters module
----------------------------------------------------------------------

.. automodule:: neptune.hard_example_mining.object_detection.scores_filters
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.hard_example_mining.rst (+27, -0)

@@ -0,0 +1,27 @@
neptune.hard\_example\_mining package
=====================================

.. automodule:: neptune.hard_example_mining
   :members:
   :undoc-members:
   :show-inheritance:

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   neptune.hard_example_mining.image_classification
   neptune.hard_example_mining.object_detection

Submodules
----------

neptune.hard\_example\_mining.base module
-----------------------------------------

.. automodule:: neptune.hard_example_mining.base
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.joint_inference.rst (+26, -0)

@@ -0,0 +1,26 @@
neptune.joint\_inference package
================================

.. automodule:: neptune.joint_inference
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

neptune.joint\_inference.data module
------------------------------------

.. automodule:: neptune.joint_inference.data
   :members:
   :undoc-members:
   :show-inheritance:

neptune.joint\_inference.joint\_inference module
------------------------------------------------

.. automodule:: neptune.joint_inference.joint_inference
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.rst (+18, -0)

@@ -0,0 +1,18 @@

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   neptune.hard_example_mining
   neptune.joint_inference

neptune.context module
----------------------

.. automodule:: neptune.context
   :members:
   :undoc-members:
   :show-inheritance:


docs/make.bat (+35, -0)

@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd

docs/proposals/architecture.md (+58, -0)

@@ -0,0 +1,58 @@
# Edge Cloud Collaborative AI Framework
## Motivation
Currently, "Edge AI" in the industry is at an early stage of training on the cloud and inference on the edge. However, the future trend has emerged, and related research and practice are booming, bringing new value growth points for edge computing and AI. Also, edge AI applications have much room for optimization in terms of cost, model effect, and privacy protection. For example:
This proposal provides a basic framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement, and data privacy protection.
### Goals
For AI applications running at the edge, the goals of the edge-cloud collaborative framework are:
* reducing resource cost on the edge
* improving model performance
* protecting data privacy
## Proposal
* What we propose:
  * an edge-cloud collaborative AI framework based on KubeEdge
  * with embedded collaborative training and joint inference algorithms
  * working with existing AI frameworks, e.g., TensorFlow
* 3 features:
  * joint inference
  * incremental learning
  * federated learning
* Target users:
  * Domain-specific AI developers: build and publish edge-cloud collaborative AI services/functions easily
  * Application developers: use edge-cloud collaborative AI capabilities.
* We are NOT:
  * re-inventing existing ML frameworks, e.g., TensorFlow, PyTorch, MindSpore, etc.
  * re-inventing existing edge platforms, e.g., KubeEdge, etc.
  * offering domain/application-specific algorithms, e.g., facial recognition, text classification, etc.
### Architecture
![](./images/framework.png)
* GlobalManager: implements the Edge AI feature controllers based on the [k8s operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)
  * Federated Learning Controller: implements the federated learning feature based on user-created CRDs
  * Incremental Learning Controller: implements the incremental learning feature based on user-created CRDs
  * Joint Inference Controller: implements the joint inference feature based on user-created CRDs
* LocalController: manages the Edge AI features and the extra dataset/model resources on the edge nodes
* Workers: include the training/evaluation/inference/aggregator workers
  * perform inference or training based on existing ML frameworks
  * launched on demand; think of them as Docker containers
  * different workers for different features
  * can run on the edge or in the cloud
* Lib: exposes the Edge AI features to applications, i.e., training or inference programs

docs/proposals/dataset-and-model.md (+350, -0)

@@ -0,0 +1,350 @@
* [Dataset and Model](#dataset-and-model)
  * [Motivation](#motivation)
    * [Goals](#goals)
    * [Non\-goals](#non-goals)
  * [Proposal](#proposal)
    * [Use Cases](#use-cases)
  * [Design Details](#design-details)
    * [CRD API Group and Version](#crd-api-group-and-version)
    * [CRDs](#crds)
    * [Type definition](#crd-type-definition)
    * [Crd sample](#crd-samples)
  * [Controller Design](#controller-design)
# Dataset and Model
## Motivation
Currently, the Edge AI features depend on the `dataset` and `model` objects.
This proposal provides the definitions of dataset and model as first-class k8s resources.
### Goals
* Metadata of `dataset` and `model` objects.
* Used by the Edge AI features.
### Non-goals
* The concrete format of the AI `dataset`, such as `imagenet`, `coco` or `tf-record`.
* The concrete format of the AI `model`, such as `ckpt`, `saved_model` of tensorflow.
* The concrete operations on the AI `dataset`, such as `shuffle`, `crop`.
* The concrete operations on the AI `model`, such as `train`, `inference`.
## Proposal
We propose using Kubernetes Custom Resource Definitions (CRDs) to describe
the dataset/model specification/status and a controller to synchronize these updates between edge and cloud.
![](./images/dataset-model-crd.png)
### Use Cases
* Users can create a dataset resource by providing the `dataset url`, `format`, and the `nodeName` which owns the dataset.
* Users can create a model resource by providing the `model url` and `format`.
* Users can view the information of a dataset/model.
* Users can delete a dataset/model.
## Design Details
### CRD API Group and Version
The `Dataset` and `Model` CRDs will be namespace-scoped.
The tables below summarize the group, kind and API version details for the CRDs.
* Dataset
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | Dataset |
* Model
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | Model |
### CRDs
#### `Dataset` CRD
[crd source](/build/crds/neptune/dataset_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: datasets.neptune.io
spec:
  group: neptune.io
  names:
    kind: Dataset
    plural: datasets
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - url
                - format
              properties:
                url:
                  type: string
                format:
                  type: string
                nodeName:
                  type: string
            status:
              type: object
              properties:
                numberOfSamples:
                  type: integer
                updateTime:
                  type: string
                  format: date-time
      additionalPrinterColumns:
        - name: NumberOfSamples
          type: integer
          description: The number of samples in the dataset
          jsonPath: ".status.numberOfSamples"
        - name: Node
          type: string
          description: The node name of the dataset
          jsonPath: ".spec.nodeName"
        - name: spec
          type: string
          description: The spec of the dataset
          jsonPath: ".spec"
```
1. `format` of dataset

   We use this field to report the number of samples for the dataset and to do dataset splitting.
   Currently we support the formats below:
   - txt: one nonempty line is one sample
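
For the `txt` format, counting samples therefore reduces to counting nonempty lines. Below is a minimal Go sketch of such a counter; the function name and the file path are illustrative assumptions, not part of the current code base:

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// countTxtSamples counts the samples of a txt-format dataset,
// where each nonempty line is one sample.
func countTxtSamples(path string) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	count := 0
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if strings.TrimSpace(scanner.Text()) != "" {
			count++
		}
	}
	return count, scanner.Err()
}

func main() {
	// hypothetical dataset index file under the dataset url
	n, err := countTxtSamples("/code/data/train_data.txt")
	if err != nil {
		fmt.Println("count samples:", err)
		return
	}
	// such a count would be reported as .status.numberOfSamples
	fmt.Println("numberOfSamples:", n)
}
```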
#### `Model` CRD
[crd source](/build/crds/neptune/model_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: models.neptune.io
spec:
  group: neptune.io
  names:
    kind: Model
    plural: models
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - url
                - format
              properties:
                url:
                  type: string
                format:
                  type: string
            status:
              type: object
              properties:
                updateTime:
                  type: string
                  format: date-time
                metrics:
                  type: array
                  items:
                    type: object
                    properties:
                      key:
                        type: string
                      value:
                        type: string
      additionalPrinterColumns:
        - name: updateAGE
          type: date
          description: The update age
          jsonPath: ".status.updateTime"
        - name: metrics
          type: string
          description: The metrics
          jsonPath: ".status.metrics"
```
### CRD type definition
- `Dataset`
[go source](cloud/pkg/apis/neptune/v1alpha1/dataset_types.go)
```go
package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// Dataset describes the data that a dataset resource should have
type Dataset struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   DatasetSpec   `json:"spec"`
	Status DatasetStatus `json:"status"`
}

// DatasetSpec is a description of a dataset
type DatasetSpec struct {
	URL      string `json:"url"`
	Format   string `json:"format"`
	NodeName string `json:"nodeName"`
}

// DatasetStatus represents information about the status of a dataset
// including the time a dataset updated, and number of samples in a dataset
type DatasetStatus struct {
	UpdateTime      *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"`
	NumberOfSamples int          `json:"numberOfSamples"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// DatasetList is a list of Datasets
type DatasetList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []Dataset `json:"items"`
}
```
- `Model`
[go source](cloud/pkg/apis/neptune/v1alpha1/model_types.go)
```go
package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// Model describes the data that a model resource should have
type Model struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   ModelSpec   `json:"spec"`
	Status ModelStatus `json:"status"`
}

// ModelSpec is a description of a model
type ModelSpec struct {
	URL    string `json:"url"`
	Format string `json:"format"`
}

// ModelStatus represents information about the status of a model
// including the time a model updated, and metrics in a model
type ModelStatus struct {
	UpdateTime *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"`
	Metrics    []Metric     `json:"metrics,omitempty" protobuf:"bytes,2,rep,name=metrics"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// ModelList is a list of Models
type ModelList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []Model `json:"items"`
}
```
### Crd samples
- `Dataset`
```yaml
apiVersion: neptune.io/v1alpha1
kind: Dataset
metadata:
  name: "dataset-examp"
spec:
  url: "/code/data"
  format: "txt"
  nodeName: "edge0"
```
- `Model`
```yaml
apiVersion: neptune.io/v1alpha1
kind: Model
metadata:
  name: model-examp
spec:
  url: "/model/frozen.pb"
  format: pb
```
## Controller Design
In the current design there is a downstream/upstream controller for `dataset`, but no downstream/upstream controller for `model`.<br/>
The dataset controller synchronizes the dataset between the cloud and edge.
- downstream: synchronize the dataset info from the cloud to the edge node.
- upstream: synchronize the dataset status from the edge to the cloud node, such as the number of samples the dataset contains.
<br/>
Here is the flow of the dataset creation:
![](./images/dataset-creation-flow.png)
For the model:
1. The model's info will be synced when syncing the federated-learning task (or other feature task) which uses the model.
1. The model's status will be updated when the corresponding training/inference work has completed.

BIN  docs/proposals/images/dataset-creation-flow.png (868 × 643, 38 kB)

BIN  docs/proposals/images/dataset-model-crd.png (749 × 663, 24 kB)

BIN  docs/proposals/images/framework-zh.png (945 × 650, 54 kB)

BIN  docs/proposals/images/framework.png (915 × 643, 56 kB)

BIN  docs/proposals/images/joint-inference-controller.png (762 × 324, 26 kB)

BIN  docs/proposals/images/joint-inference-downstream-controller.png (890 × 674, 32 kB)

BIN  docs/proposals/images/joint-inference-flow-creation.png (939 × 703, 55 kB)

BIN  docs/proposals/images/joint-inference-service-crd-details.png (814 × 613, 34 kB)

BIN  docs/proposals/images/joint-inference-service-crd.png (755 × 674, 24 kB)

BIN  docs/proposals/images/joint-inference-upstream-controller.png (731 × 675, 32 kB)

BIN  docs/proposals/images/joint-inference-worker-communication.png (870 × 687, 43 kB)

docs/proposals/joint-inference.md (+553, -0)

@@ -0,0 +1,553 @@
* [Joint Inference](#joint-inference)
  * [Motivation](#motivation)
    * [Goals](#goals)
    * [Non\-goals](#non-goals)
  * [Proposal](#proposal)
    * [Use Cases](#use-cases)
  * [Design Details](#design-details)
    * [CRD API Group and Version](#crd-api-group-and-version)
    * [Joint inference CRD](#joint-inference-crd)
    * [Joint inference type definition](#joint-inference-type-definition)
    * [Joint inference sample](#joint-inference-sample)
    * [Validation](#validation)
  * [Controller Design](#controller-design)
    * [Joint Inference Controller](#joint-inference-controller)
    * [Downstream Controller](#downstream-controller)
    * [Upstream Controller](#upstream-controller)
    * [Details of api between GM(cloud) and LC(edge)](#details-of-api-between-gmcloud-and-lcedge)
    * [Details of api between Worker(edge) and LC(edge)](#details-of-api-between-workeredge-and-lcedge)
    * [Flow of Joint Inference](#flow-of-joint-inference)
  * [Workers Communication](#workers-communication)
# Joint Inference
## Motivation
Inference on the edge achieves lower latency and higher throughput, while inference on the cloud achieves better precision.
The collaborative inference technology detects hard samples on the edge and sends them to the cloud for inference.
**In this way, inferring simple samples on the edge preserves latency and throughput, while inferring hard samples on the cloud improves the overall precision.**
### Goals
* Joint inference improves inference precision without significantly increasing latency or reducing throughput.
## Proposal
We propose using Kubernetes Custom Resource Definitions (CRDs) to describe
the joint inference specification/status and a controller to synchronize these updates between edge and cloud.
![](./images/joint-inference-service-crd.png)
### Use Cases
* Users can create a joint inference service by providing the edge/cloud inference worker scripts,
specifying the small/big models and the hard example mining algorithm,
and configuring the inference parameters.
* Users can get the joint inference service status, including the counts of inference at the edge/cloud.
## Design Details
### CRD API Group and Version
The `JointInferenceService` CRD will be namespace-scoped.
The tables below summarize the group, kind and API version details for the CRD.
* JointInferenceService
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | JointInferenceService |
### Joint inference CRD
![](./images/joint-inference-service-crd-details.png)
Below is the CustomResourceDefinition yaml for `JointInferenceService`:
[crd source](/build/crds/neptune/jointinferenceservice_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: jointinferenceservices.neptune.io
spec:
  group: neptune.io
  names:
    kind: JointInferenceService
    plural: jointinferenceservices
    shortNames:
      - jointinferenceservice
      - jis
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - edgeWorker
                - cloudWorker
              properties:
                edgeWorker:
                  type: object
                  required:
                    - name
                    - model
                    - nodeName
                    - hardExampleAlgorithm
                    - workerSpec
                  properties:
                    name:
                      type: string
                    model:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    nodeName:
                      type: string
                    hardExampleAlgorithm:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    workerSpec:
                      type: object
                      required:
                        - scriptDir
                        - scriptBootFile
                        - frameworkType
                        - frameworkVersion
                      properties:
                        scriptDir:
                          type: string
                        scriptBootFile:
                          type: string
                        frameworkType:
                          type: string
                        frameworkVersion:
                          type: string
                        parameters:
                          type: array
                          items:
                            type: object
                            required:
                              - key
                              - value
                            properties:
                              key:
                                type: string
                              value:
                                type: string
                cloudWorker:
                  type: object
                  required:
                    - name
                    - model
                    - nodeName
                    - workerSpec
                  properties:
                    name:
                      type: string
                    model:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    nodeName:
                      type: string
                    workerSpec:
                      type: object
                      required:
                        - scriptDir
                        - scriptBootFile
                        - frameworkType
                        - frameworkVersion
                      properties:
                        scriptDir:
                          type: string
                        scriptBootFile:
                          type: string
                        frameworkType:
                          type: string
                        frameworkVersion:
                          type: string
                        parameters:
                          type: array
                          items:
                            type: object
                            required:
                              - key
                              - value
                            properties:
                              key:
                                type: string
                              value:
                                type: string
            status:
              type: object
              properties:
                conditions:
                  type: array
                  items:
                    type: object
                    properties:
                      type:
                        type: string
                      status:
                        type: string
                      lastHeartbeatTime:
                        type: string
                        format: date-time
                      lastTransitionTime:
                        type: string
                        format: date-time
                      reason:
                        type: string
                      message:
                        type: string
                startTime:
                  type: string
                  format: date-time
                active:
                  type: integer
                failed:
                  type: integer
                metrics:
                  type: array
                  items:
                    type: object
                    properties:
                      key:
                        type: string
                      value:
                        type: string
      additionalPrinterColumns:
        - name: status
          type: string
          description: The status of the jointinference service
          jsonPath: ".status.conditions[-1].type"
        - name: active
          type: integer
          description: The number of active worker
          jsonPath: ".status.active"
        - name: failed
          type: integer
          description: The number of failed worker
          jsonPath: ".status.failed"
        - name: Age
          type: date
          jsonPath: .metadata.creationTimestamp
```
### Joint inference type definition
[go source](cloud/pkg/apis/neptune/v1alpha1/jointinferenceservice_types.go)
```go
package v1alpha1

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// JointInferenceService describes the data that a jointinferenceservice resource should have
type JointInferenceService struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata"`

	Spec   JointInferenceServiceSpec   `json:"spec"`
	Status JointInferenceServiceStatus `json:"status,omitempty"`
}

// JointInferenceServiceSpec is a description of a jointinferenceservice
type JointInferenceServiceSpec struct {
	EdgeWorker  EdgeWorker  `json:"edgeWorker"`
	CloudWorker CloudWorker `json:"cloudWorker"`
}

// EdgeWorker describes the data an edge worker should have
type EdgeWorker struct {
	Name                 string               `json:"name"`
	Model                SmallModel           `json:"model"`
	NodeName             string               `json:"nodeName"`
	HardExampleAlgorithm HardExampleAlgorithm `json:"hardExampleAlgorithm"`
	WorkerSpec           CommonWorkerSpec     `json:"workerSpec"`
}

// CloudWorker describes the data a cloud worker should have
type CloudWorker struct {
	Name       string           `json:"name"`
	Model      BigModel         `json:"model"`
	NodeName   string           `json:"nodeName"`
	WorkerSpec CommonWorkerSpec `json:"workerSpec"`
}

type SmallModel struct {
	Name string `json:"name"`
}

type BigModel struct {
	Name string `json:"name"`
}

type HardExampleAlgorithm struct {
	Name string `json:"name"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// JointInferenceServiceList is a list of JointInferenceServices.
type JointInferenceServiceList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []JointInferenceService `json:"items"`
}

// JointInferenceServiceStatus represents the current state of a joint inference service.
type JointInferenceServiceStatus struct {
	// The latest available observations of a joint inference service's current state.
	// +optional
	Conditions []JointInferenceServiceCondition `json:"conditions,omitempty"`

	// Represents time when the service was acknowledged by the service controller.
	// It is not guaranteed to be set in happens-before order across separate operations.
	// It is represented in RFC3339 form and is in UTC.
	// +optional
	StartTime *metav1.Time `json:"startTime,omitempty"`

	// The number of actively running workers.
	// +optional
	Active int32 `json:"active"`

	// The number of workers which reached the Failed phase.
	// +optional
	Failed int32 `json:"failed"`

	// Metrics of the joint inference service.
	Metrics []Metric `json:"metrics,omitempty"`
}

type JointInferenceServiceConditionType string

// These are valid conditions of a service.
const (
	// JointInferenceServiceCondPending means the service has been accepted by the system,
	// but one or more of the workers has not been started.
	JointInferenceServiceCondPending JointInferenceServiceConditionType = "Pending"
	// JointInferenceServiceCondFailed means the service has failed its execution.
	JointInferenceServiceCondFailed JointInferenceServiceConditionType = "Failed"
	// JointInferenceServiceCondRunning means the service is running.
	JointInferenceServiceCondRunning JointInferenceServiceConditionType = "Running"
)

// JointInferenceServiceCondition describes the current state of a service.
type JointInferenceServiceCondition struct {
	// Type of service condition, Pending, Running or Failed.
	Type JointInferenceServiceConditionType `json:"type"`
	// Status of the condition, one of True, False, Unknown.
	Status v1.ConditionStatus `json:"status"`
	// Last time the condition was checked.
	// +optional
	LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
	// Last time the condition transit from one status to another.
	// +optional
	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
	// (brief) reason for the condition's last transition.
	// +optional
	Reason string `json:"reason,omitempty"`
	// Human readable message indicating details about last transition.
	// +optional
	Message string `json:"message,omitempty"`
}
```
#### Validation
[Open API v3 Schema based validation](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#validation) can be used to guard against bad requests.
Invalid values for fields (e.g., a string value for a boolean field) can be validated using this.
Here is a list of validations we need to support:
1. The `dataset` specified in the crd should exist in k8s.
1. The `model` specified in the crd should exist in k8s.
1. The edge node name specified in the crd should exist in k8s.
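
As an illustration, such an existence check can be done with the k8s dynamic client. This is only a sketch under the CRD group/version defined above; the function name and wiring are assumptions, not the actual GM code:

```go
package validation

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
)

// modelGVR identifies the Model CRD defined in the dataset-and-model proposal.
var modelGVR = schema.GroupVersionResource{
	Group:    "neptune.io",
	Version:  "v1alpha1",
	Resource: "models",
}

// checkModelExists verifies that the model referenced by a
// JointInferenceService exists in the given namespace.
func checkModelExists(ctx context.Context, client dynamic.Interface, namespace, name string) error {
	if _, err := client.Resource(modelGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}); err != nil {
		return fmt.Errorf("model %s/%s not found: %w", namespace, name, err)
	}
	return nil
}
```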
### Joint inference sample
```yaml
apiVersion: neptune.io/v1alpha1
kind: JointInferenceService
metadata:
  name: helmet-detection-demo
  namespace: default
spec:
  edgeWorker:
    name: "edgeworker"
    model:
      name: "small-model"
    nodeName: "edge0"
    hardExampleAlgorithm:
      name: "IBT"
    workerSpec:
      scriptDir: "/code"
      scriptBootFile: "edge_inference.py"
      frameworkType: "tensorflow"
      frameworkVersion: "1.18"
      parameters:
        - key: "nms_threshold"
          value: "0.6"
  cloudWorker:
    name: "work"
    model:
      name: "big-model"
    nodeName: "solar-corona-cloud"
    workerSpec:
      scriptDir: "/code"
      scriptBootFile: "cloud_inference.py"
      frameworkType: "tensorflow"
      frameworkVersion: "1.18"
      parameters:
        - key: "nms_threshold"
          value: "0.6"
```
## Controller Design
The joint inference controller starts three separate goroutines called the `upstream`, `downstream` and `joint-inference` controller. These are not separate controllers as such but are named here for clarity.
- joint-inference: watches the updates of joint-inference service CRDs, and creates the workers to complete the task.
- downstream: synchronizes the joint-inference updates from the cloud to the edge node.
- upstream: synchronizes the joint-inference updates from the edge to the cloud node.
### Joint Inference Controller
![](./images/joint-inference-controller.png)
The joint-inference controller watches for the updates of joint-inference tasks and the corresponding pods against the K8S API server.
Updates are categorized below along with the possible actions:
| Update Type | Action |
|-------------------------------|---------------------------------------------- |
|New Joint-inference-service Created |Create the cloud/edge worker|
|Joint-inference-service Deleted | NA. These workers will be deleted by GM.|
|The corresponding pod created/running/completed/failed | Update the status of joint-inference task.|
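
A simplified sketch of this dispatch is shown below; the `service` type and the two callbacks are placeholders standing in for the real worker-creation and status-update logic, not the actual GM implementation:

```go
package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/watch"
)

// service is a placeholder for the JointInferenceService object defined above.
type service struct {
	Namespace string
	Name      string
}

// handleServiceEvent mirrors the table above: it maps a watch event
// for a joint-inference service to the corresponding action.
func handleServiceEvent(ctx context.Context, eventType watch.EventType, svc service,
	createWorkers func(context.Context, service) error,
	updateStatus func(context.Context, service) error) error {
	switch eventType {
	case watch.Added:
		// a new joint-inference service: create the cloud/edge workers
		return createWorkers(ctx, svc)
	case watch.Modified:
		// the corresponding pod was created/running/completed/failed:
		// update the status of the joint-inference task
		return updateStatus(ctx, svc)
	case watch.Deleted:
		// nothing to do here: the workers will be deleted by GM
		return nil
	}
	return nil
}
```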
### Downstream Controller
![](./images/joint-inference-downstream-controller.png)
The downstream controller watches for joint-inference updates against the K8S API server.
Updates are categorized below along with the possible actions that the downstream controller can take:
| Update Type | Action |
|-------------------------------|---------------------------------------------- |
|New Joint-inference-service Created |Sends the task information to LCs.|
|Joint-inference-service Deleted | The controller sends the delete event to LCs.|
### Upstream Controller
![](./images/joint-inference-upstream-controller.png)
The upstream controller watches for joint-inference-task updates from the edge node and applies these updates against the API server in the cloud.
Updates are categorized below along with the possible actions that the upstream controller can take:
| Update Type | Action |
|------------------------------- |---------------------------------------------- |
|Joint-inference-service Reported State Updated | The controller appends the reported status of the Joint-inference-service in the cloud. |
### Details of api between GM(cloud) and LC(edge)
1. GM(downstream controller) syncs the task info to LC:
```go
// POST <namespace>/neptune/downstream/jointinferenceservices/<name>/insert
// body same to the task crd of k8s api, omitted here.
```
1. LC uploads the task status reported by the worker to GM (upstream controller):
```go
// POST <namespace>/neptune/upstream/jointinferenceservices/<name>/status
// JoinInferenceServiceStatus defines the status that is sent to GlobalManager
type JoinInferenceServiceStatus struct {
	Phase  string  `json:"phase"`
	Status string  `json:"status"`
	Output *Output `json:"output"`
}

// Output defines task output information
type Output struct {
	Models   []Model   `json:"models"`
	TaskInfo *TaskInfo `json:"taskInfo"`
}

// Model defines the model information
type Model struct {
	Format string `json:"format"`
	URL    string `json:"url"`
}

// TaskInfo defines the task information
type TaskInfo struct {
	InferenceNumber   int     `json:"inferenceNumber"`
	HardExampleNumber int     `json:"hardExampleNumber"`
	UploadCloudRatio  float64 `json:"uploadCloudRatio"`
	StartTime         string  `json:"startTime"`
	CurrentTime       string  `json:"currentTime"`
}
```
```
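
For illustration, a minimal sketch of how LC could post this status to the GM endpoint above. The HTTP scheme, error handling, and function name are assumptions for this sketch, not the actual LC code:

```go
package lc

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// reportServiceStatus posts the status reported by the worker to the GM
// upstream endpoint described above. gmAddress is e.g. "192.168.0.9:9000";
// status is expected to be the JoinInferenceServiceStatus defined above.
func reportServiceStatus(gmAddress, namespace, name string, status interface{}) error {
	url := fmt.Sprintf("http://%s/%s/neptune/upstream/jointinferenceservices/%s/status",
		gmAddress, namespace, name)

	body, err := json.Marshal(status)
	if err != nil {
		return err
	}

	resp, err := http.Post(url, "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status code %d from GM", resp.StatusCode)
	}
	return nil
}
```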
### Details of api between Worker(edge) and LC(edge)
1. The worker sends inference info to the LC on the same edge node:
```
// POST /neptune/workers/<worker-name>/info
```
```json
{
    "name": "worker-name",
    "namespace": "default",
    "ownerName": "jointinferenceservice-name",
    "ownerKind": "jointinferenceservice",
    "kind": "inference",
    "status": "completed/failed/running",
    "taskInfo": {
        "inferenceNumber": 1000,
        "hardExampleNumber": 100,
        "uploadCloudRatio": 0.1,
        "startTime": "2020-11-03T08:39:22.517Z",
        "updateTime": "2020-11-03T08:50:22.517Z"
    }
}
```
### Flow of Joint Inference
- The flow of joint inference service creation:
![](./images/joint-inference-flow-creation.png)
## Workers Communication
![](./images/joint-inference-worker-communication.png)
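
To make the communication in the diagram concrete, here is a sketch of the edge-side flow: infer with the small model, run the hard example algorithm, and forward hard samples to the cloud worker. All function names and the cloud endpoint are illustrative assumptions; the real workers are implemented on top of the lib and an existing ML framework:

```go
package worker

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// InferenceResult is an illustrative result type shared by edge and cloud workers.
type InferenceResult struct {
	Boxes [][]float64 `json:"boxes"`
}

// jointInfer runs the small model on the edge first; if the hard example
// algorithm (e.g. IBT) marks the sample as hard, the raw sample is sent to
// the cloud worker, whose big-model result replaces the edge result.
func jointInfer(sample []byte,
	edgeInfer func([]byte) (InferenceResult, error),
	isHardExample func(InferenceResult) bool,
	cloudURL string) (InferenceResult, error) {

	result, err := edgeInfer(sample)
	if err != nil {
		return InferenceResult{}, err
	}
	if !isHardExample(result) {
		// simple sample: keep the edge result
		return result, nil
	}

	// hard sample: forward to the cloud worker for big-model inference
	resp, err := http.Post(cloudURL, "application/octet-stream", bytes.NewReader(sample))
	if err != nil {
		return result, fmt.Errorf("cloud inference failed, keeping edge result: %w", err)
	}
	defer resp.Body.Close()

	var cloudResult InferenceResult
	if err := json.NewDecoder(resp.Body).Decode(&cloudResult); err != nil {
		return result, err
	}
	return cloudResult, nil
}
```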

docs/quickstart.md (+23, -0)

@@ -0,0 +1,23 @@
## Getting started

Neptune is an open source framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement and data privacy protection.

### Get Neptune

You can find the latest Neptune release [here](TODO)

### Deploying Neptune

Please refer to this [link](setup/install.html).

### Examples
Please refer to this [link](TODO).

### Contributing

Contributions are very welcome! You can see our [CONTRIBUTING.md](TODO) for more information.

### Community

Neptune is an open source project. In the spirit of openness and freedom, we welcome new contributors to join us. You can get in touch with the community through the following channels:
* [Github Issues](TODO)

docs/roadmap.md (+21, -0)

@@ -0,0 +1,21 @@
# Roadmap

This document defines a high-level roadmap for Neptune development.

The [milestones defined in GitHub](https://github.com/edgeai-neptune/neptune/milestones) represent the most up-to-date plans.


## 2021 Q1 Roadmap

- Support edge model and dataset management.
- Support incremental learning, with time trigger, sample-size trigger, and precision-based trigger, and integrate a hard sample discovering algorithm.
- Support collaborative training, integrating some common weight/gradient compression algorithms.


## Future

- Integrate some common multi-task migration algorithms to resolve the problem of low precision caused by small sample sizes.
- Integrate KubeFlow and ONNX into Neptune, to enable interoperability of edge models with diverse formats.
- Integrate typical AI frameworks into Neptune, including TensorFlow, PyTorch, PaddlePaddle, MindSpore, etc.



docs/setup/install.md (+294, -0)

@@ -0,0 +1,294 @@
* [Prerequisites](#prerequisites)
* [Download project source](#download-source)
* [Create CRDs](#create-crds)
* [Deploy GM](#deploy-gm)
  * [Prepare GM config](#prepare-gm-config)
  * [Build worker base images](#build-worker-base-images)
  * [Run GM as k8s pod(recommended)](#run-gm-as-k8s-podrecommended)
  * [Run GM as a single process(alternative)](#run-gm-as-a-single-processalternative)
  * [Run GM as docker container(alternative)](#run-gm-as-docker-containeralternative)
* [Deploy LC](#deploy-lc)

## Deploy Neptune

### Prerequisites

- [GIT][git_tool]
- [GO][go_tool] version v1.15+.
- [Kubernetes][kubernetes] 1.16+.
- [KubeEdge][kubeedge] version v1.5+.

GM will be deployed to a node which satisfies these requirements:
1. Has a public IP address which the edge nodes can access.
1. Can access the k8s master.

You can simply use the node where the `cloudcore` of `kubeedge` is deployed.

The shell commands below should be executed on this node and in **one terminal session**, so that the shell variables are kept.

### Download source
```shell
git clone http://github.com/edgeai-neptune/neptune.git
cd neptune
git checkout master
```

### Create CRDs

```shell
# create these crds including dataset, model, joint-inference
kubectl apply -f build/crds/neptune/
```

### Deploy GM

#### Prepare GM config
The config file `build/gm/gm-config.yaml` looks like:
```yaml
kubeConfig: ""
master: ""
namespace: ""
imageHub:
  "tensorflow:1.15": "docker.io/neptune/tensorflow-base-image-to-filled:1.15"
websocket:
  address: 0.0.0.0
  port: 9000
localController:
  server: http://localhost:9100
```
1. `kubeConfig`: config to connect k8s, default `""`
1. `master`: k8s master addr, default `""`
1. `namespace`: the namespace GM watches, `""` means that gm watches all namespaces, default `""`.
1. `imageHub`: the base image mapping for model training/evaluation/inference, whose key is frameworkType/frameworkVersion.
1. `websocket`: due to the current limitation of kubeedge (1.5), GM needs to build a websocket channel for communication between GM and LCs.
1. `localController`:
- `server`: to be injected into the worker to connect LC.

#### Build worker base images

Here we build the worker base image for TensorFlow 1.15 as an example:
```shell
# edit it with the base repo of your choice.
IMG_BASE_ADDR=docker.io/neptune

# build tensorflow image
WORKER_TF1_IMAGE=$IMG_BASE_ADDR/worker-tensorflow:1.15

docker build -f build/worker/base_images/tensorflow/tensorflow-1.15.Dockerfile -t $WORKER_TF1_IMAGE .

# push worker image to registry, login to registry first if needed
docker push $WORKER_TF1_IMAGE
```



There are several ways to run GM; you can choose one of the methods below:

#### Run GM as k8s pod(**recommended**):

We don't need to configure kubeconfig in this method; see [Accessing the API from a Pod](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod).

1\. Create the cluster role so that GM can access/write the CRDs:
```shell
# create the cluster role
kubectl create -f build/gm/rbac/
```

2\. Prepare the config:
```shell
# edit it with another number if you wish
GM_PORT=9000
LC_PORT=9100

# fill in the GM node's IP which the edge nodes can access.
# such as GM_IP=192.168.0.9
GM_IP=<GM_NODE_NAME_IP_ADDRESS>

# edit it with the base repo of your choice.
IMG_BASE_ADDR=docker.io/neptune

GM_ADDRESS=$GM_IP:$GM_PORT
LC_SERVER="http://localhost:$LC_PORT"

```

```shell
# copy and edit CONFIG_FILE.
CONFIG_FILE=gm-config.yaml
cp build/gm/gm-config.yaml $CONFIG_FILE

# prepare the config with an empty kubeconfig and an empty master url, meaning k8s is accessed by rest.InClusterConfig().
# here we use sed; alternatively you can edit the config file manually.
sed -i 's@kubeConfig:.*@kubeConfig: ""@' $CONFIG_FILE
sed -i 's@master:.*@master: ""@' $CONFIG_FILE

sed -i "s@port:.*@port: $GM_PORT@" $CONFIG_FILE

# setting tensorflow1.15 base image
sed -i 's@\("tensorflow:1.15":\).*@\1 '"$WORKER_TF1_IMAGE@" $CONFIG_FILE

# setting lc server
sed -i "s@http://localhost:9100@$LC_SERVER@" $CONFIG_FILE

```

3\. Build the GM image:
```shell
# build the image from source OR use the gm image previously built.

# edit it with the base repo of your choice.
GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1

# build docker image
docker build -f build/gm/Dockerfile --tag $GM_IMAGE .

# push image to registry, login to registry first if needed
docker push $GM_IMAGE
```

4\. Create gm configmap:
```shell
# create configmap from $CONFIG_FILE
CONFIG_NAME=neptune-gm-config # customize this configmap name
kubectl create configmap $CONFIG_NAME --from-file=$CONFIG_FILE
```

5\. Deploy GM as pod:
```shell
# we assign gm to a node which the edge nodes can access.
# here we use the current terminal's node name, i.e. the k8s master node.
# remember the GM_IP
GM_NODE_NAME=$(hostname)
GM_POD_NAME=gm-from-$CONFIG_NAME
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: $GM_POD_NAME
spec:
  restartPolicy: OnFailure
  hostNetwork: true
  nodeName: $GM_NODE_NAME
  containers:
    - name: gm
      image: $GM_IMAGE
      command: ["neptune-gm", "--config", "/config/$CONFIG_FILE", "-v2"]
      volumeMounts:
        - name: gm-config
          mountPath: /config
  volumes:
    - name: gm-config
      configMap:
        name: $CONFIG_NAME
EOF
```

6\. Check the GM status:
```shell
kubectl get pod $GM_POD_NAME
```

#### Run GM as a single process(alternative)
1\. config GM:
```shell
cp build/gm/neptune-gm.yaml gm.yaml
# make sure /root/.kube/config exists
sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' gm.yaml
```

2\. Compile and run GM directly:
```shell
go build cmd/neptune-gm/neptune-gm.go
./neptune-gm --config gm.yaml -v2
```

#### Run GM as docker container(alternative)
1\. build GM image:
```shell
GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1
sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' build/gm/neptune-gm.yaml
docker build -f build/gm/Dockerfile --tag $GM_IMAGE .
```

2\. run GM as container:
```shell
docker run --net host -v /root/.kube:/root/.kube $GM_IMAGE
```

### Deploy LC
Prerequisites:
1. Run GM successfully.
2. Get the bind address/port of GM.

Steps:

1\. Build LC image:
```shell
LC_IMAGE=$IMG_BASE_ADDR/neptune-lc:v1alpha1

docker build -f build/lc/Dockerfile --tag $LC_IMAGE .

# push image to registry, login to registry first if needed
docker push $LC_IMAGE
```

2\. Deploy LC as k8s daemonset:
```shell
LC_DS_NAME=edge-lc

kubectl create -f- <<EOF
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    k8s-app: neptune-lc
  name: $LC_DS_NAME
  namespace: default
spec:
  selector:
    matchLabels:
      k8s-app: $LC_DS_NAME
  template:
    metadata:
      labels:
        k8s-app: $LC_DS_NAME
    spec:
      containers:
        - name: $LC_DS_NAME
          image: $LC_IMAGE
          imagePullPolicy: Always
          env:
            - name: GM_ADDRESS
              value: $GM_ADDRESS
            - name: BIND_PORT
              value: "$LC_PORT"
            - name: NODENAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: ROOTFS_MOUNT_DIR
              # the value of ROOTFS_MOUNT_DIR is the same as the mount path of the volume
              value: /rootfs
          volumeMounts:
            - name: localcontroller
              mountPath: /rootfs
      volumes:
        - name: localcontroller
          hostPath:
            path: /
      restartPolicy: Always
      hostNetwork: true
EOF
```

3\. Check the LC status:
```shell
kubectl get ds $LC_DS_NAME

kubectl get pod |grep $LC_DS_NAME
```

[git_tool]:https://git-scm.com/downloads
[go_tool]:https://golang.org/dl/
[kubeedge]:https://github.com/kubeedge/kubeedge
[kubernetes]:https://kubernetes.io/
