
add documentation of the basic framework

1. add the design document of the basic framework.
2. add the roadmap.
3. add the proposal of joint inference.

Signed-off-by: Jie Pu <pujie2@huawei.com>
tags/v0.1.0
Jie Pu llhuii 4 years ago
parent commit caaee68204
29 changed files with 1620 additions and 0 deletions
  1. docs/Makefile (+34, -0)
  2. docs/_static/css/custom.css (+3, -0)
  3. docs/conf.py (+79, -0)
  4. docs/examples/joint_inference_example_link.rst (+1, -0)
  5. docs/index.rst (+57, -0)
  6. docs/lib-api/modules.rst (+7, -0)
  7. docs/lib-api/neptune.hard_example_mining.image_classification.rst (+16, -0)
  8. docs/lib-api/neptune.hard_example_mining.object_detection.rst (+18, -0)
  9. docs/lib-api/neptune.hard_example_mining.rst (+27, -0)
  10. docs/lib-api/neptune.joint_inference.rst (+26, -0)
  11. docs/lib-api/neptune.rst (+18, -0)
  12. docs/make.bat (+35, -0)
  13. docs/proposals/architecture.md (+58, -0)
  14. docs/proposals/dataset-and-model.md (+350, -0)
  15. docs/proposals/images/dataset-creation-flow.png (BIN)
  16. docs/proposals/images/dataset-model-crd.png (BIN)
  17. docs/proposals/images/framework-zh.png (BIN)
  18. docs/proposals/images/framework.png (BIN)
  19. docs/proposals/images/joint-inference-controller.png (BIN)
  20. docs/proposals/images/joint-inference-downstream-controller.png (BIN)
  21. docs/proposals/images/joint-inference-flow-creation.png (BIN)
  22. docs/proposals/images/joint-inference-service-crd-details.png (BIN)
  23. docs/proposals/images/joint-inference-service-crd.png (BIN)
  24. docs/proposals/images/joint-inference-upstream-controller.png (BIN)
  25. docs/proposals/images/joint-inference-worker-communication.png (BIN)
  26. docs/proposals/joint-inference.md (+553, -0)
  27. docs/quickstart.md (+23, -0)
  28. docs/roadmap.md (+21, -0)
  29. docs/setup/install.md (+294, -0)

docs/Makefile (+34, -0)

@@ -0,0 +1,34 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = build

SPHINXAPIDOC = sphinx-apidoc

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

clean:
	rm -rf "$(BUILDDIR)"

api:
	rm -rf ./source/api/*
	@$(SPHINXAPIDOC) -M -o ./lib-api/joint_inference ../lib/neptune/joint_inference

html:
	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

all: clean api html

docs/_static/css/custom.css (+3, -0)

@@ -0,0 +1,3 @@
.wy-nav-content {
    max-width: 100%;
}

docs/conf.py (+79, -0)

@@ -0,0 +1,79 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import sphinx_rtd_theme

sys.path.insert(0, os.path.abspath('../lib'))
sys.path.insert(0, os.path.abspath('../lib/neptune'))

# -- Project information -----------------------------------------------------

project = 'Neptune'
copyright = '2020, Kubeedge'
author = 'Kubeedge'

# -- General configuration ---------------------------------------------------

from recommonmark.parser import CommonMarkParser

source_parsers = {
    '.md': CommonMarkParser,
}
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['m2r2', 'sphinx.ext.autodoc', 'sphinx_markdown_tables', ]

# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The master toctree document
master_doc = 'index'

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

html_static_path = ['_static']

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

html_theme_options = {
    'prev_next_buttons_location': 'both'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

source_suffix = {
    '.rst': 'restructuredtext',
    '.txt': 'markdown',
    '.md': 'markdown',
}


def setup(app):
    app.add_stylesheet('css/custom.css')

docs/examples/joint_inference_example_link.rst (+1, -0)

@@ -0,0 +1 @@
.. mdinclude:: ../../examples/helmet_detection_inference/README.md

docs/index.rst (+57, -0)

@@ -0,0 +1,57 @@
===========================================
Neptune documentation
===========================================



.. toctree::
   :maxdepth: 1
   :caption: QUICK START
   :hidden:

   quickstart


.. toctree::
   :maxdepth: 1
   :caption: INTRODUCTION
   :hidden:

   proposals/architecture
   proposals/dataset-and-model
   proposals/joint-inference


.. toctree::
   :maxdepth: 1
   :caption: Setup

   setup/install


.. toctree::
   :maxdepth: 1
   :caption: EXAMPLES
   :hidden:

   examples/joint_inference_example_link


.. toctree::
   :maxdepth: 2
   :caption: API
   :hidden:

   lib-api/modules






Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

docs/lib-api/modules.rst (+7, -0)

@@ -0,0 +1,7 @@
neptune
=======

.. toctree::
   :maxdepth: 4

   neptune

docs/lib-api/neptune.hard_example_mining.image_classification.rst (+16, -0)

@@ -0,0 +1,16 @@
neptune.hard\_example\_mining.image\_classification package
===========================================================

.. automodule:: neptune.hard_example_mining.image_classification
   :members:
   :undoc-members:
   :show-inheritance:


neptune.hard\_example\_mining.image\_classification.hard\_mine\_filters module
------------------------------------------------------------------------------

.. automodule:: neptune.hard_example_mining.image_classification.hard_mine_filters
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.hard_example_mining.object_detection.rst (+18, -0)

@@ -0,0 +1,18 @@
neptune.hard\_example\_mining.object\_detection package
=======================================================

.. automodule:: neptune.hard_example_mining.object_detection
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

neptune.hard\_example\_mining.object\_detection.scores\_filters module
----------------------------------------------------------------------

.. automodule:: neptune.hard_example_mining.object_detection.scores_filters
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.hard_example_mining.rst (+27, -0)

@@ -0,0 +1,27 @@
neptune.hard\_example\_mining package
=====================================

.. automodule:: neptune.hard_example_mining
   :members:
   :undoc-members:
   :show-inheritance:

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   neptune.hard_example_mining.image_classification
   neptune.hard_example_mining.object_detection

Submodules
----------

neptune.hard\_example\_mining.base module
-----------------------------------------

.. automodule:: neptune.hard_example_mining.base
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.joint_inference.rst (+26, -0)

@@ -0,0 +1,26 @@
neptune.joint\_inference package
================================

.. automodule:: neptune.joint_inference
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

neptune.joint\_inference.data module
------------------------------------

.. automodule:: neptune.joint_inference.data
   :members:
   :undoc-members:
   :show-inheritance:

neptune.joint\_inference.joint\_inference module
------------------------------------------------

.. automodule:: neptune.joint_inference.joint_inference
   :members:
   :undoc-members:
   :show-inheritance:

docs/lib-api/neptune.rst (+18, -0)

@@ -0,0 +1,18 @@

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   neptune.hard_example_mining
   neptune.joint_inference

neptune.context module
----------------------

.. automodule:: neptune.context
   :members:
   :undoc-members:
   :show-inheritance:


docs/make.bat (+35, -0)

@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd

docs/proposals/architecture.md (+58, -0)

@@ -0,0 +1,58 @@
# Edge Cloud Collaborative AI Framework
## Motivation
Currently, "Edge AI" in the industry is at an early stage of training on the cloud and inference on the edge. However, the future trend has emerged, and related research and practice are booming, bringing new value growth points for edge computing and AI. Also, edge AI applications have much room for optimization in terms of cost, model effect, and privacy protection. For example:
This proposal provides a basic framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement, and data privacy protection.
### Goals
For AI applications running at the edge, the goals of the edge-cloud collaborative framework are:
* reducing resource cost on the edge
* improving model performance
* protecting data privacy
## Proposal
* What we propose:
  * an edge-cloud collaborative AI framework based on KubeEdge
  * with embedded collaborative training and joint inference algorithms
  * working with existing AI frameworks, e.g., TensorFlow
* 3 features:
  * joint inference
  * incremental learning
  * federated learning
* Target users:
  * Domain-specific AI developers: build and publish edge-cloud collaborative AI services/functions easily
  * Application developers: use edge-cloud collaborative AI capabilities.
* We are NOT:
  * re-inventing existing ML frameworks, e.g., TensorFlow, PyTorch, MindSpore, etc.
  * re-inventing existing edge platforms, e.g., KubeEdge, etc.
  * offering domain/application-specific algorithms, e.g., facial recognition, text classification, etc.
### Architecture
![](./images/framework.png)
* GlobalManager: implements the Edge AI feature controllers based on the [k8s operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)
  * Federated Learning Controller: implements the federated learning feature based on user-created CRDs
  * Incremental Learning Controller: implements the incremental learning feature based on user-created CRDs
  * Joint Inference Controller: implements the joint inference feature based on user-created CRDs
* LocalController: manages the Edge AI features and the extra dataset/model resources on the edge nodes
* Workers: include the training/evaluation/inference/aggregator workers
  * perform inference or training based on existing ML frameworks
  * launched on demand; think of them as Docker containers
  * different workers for different features
  * can run on the edge or in the cloud
* Lib: exposes the Edge AI features to applications, i.e., training or inference programs

docs/proposals/dataset-and-model.md (+350, -0)

@@ -0,0 +1,350 @@
* [Dataset and Model](#dataset-and-model)
  * [Motivation](#motivation)
    * [Goals](#goals)
    * [Non\-goals](#non-goals)
  * [Proposal](#proposal)
    * [Use Cases](#use-cases)
  * [Design Details](#design-details)
    * [CRD API Group and Version](#crd-api-group-and-version)
    * [CRDs](#crds)
    * [Type definition](#crd-type-definition)
    * [Crd sample](#crd-samples)
  * [Controller Design](#controller-design)
# Dataset and Model
## Motivation
Currently, the Edge AI features depend on the `dataset` and `model` objects.
This proposal provides the definitions of dataset and model as first-class k8s resources.
### Goals
* Metadata of `dataset` and `model` objects.
* Used by the Edge AI features.
### Non-goals
* The concrete format of the AI `dataset`, such as `imagenet`, `coco` or `tf-record`.
* The concrete format of the AI `model`, such as `ckpt`, `saved_model` of tensorflow.
* The concrete operations on the AI `dataset`, such as `shuffle`, `crop`.
* The concrete operations on the AI `model`, such as `train`, `inference`.
## Proposal
We propose using Kubernetes Custom Resource Definitions (CRDs) to describe
the dataset/model specification/status and a controller to synchronize these updates between edge and cloud.
![](./images/dataset-model-crd.png)
### Use Cases
* Users can create a dataset resource by providing the `dataset url`, `format`, and the `nodeName` which owns the dataset.
* Users can create a model resource by providing the `model url` and `format`.
* Users can view the information of a dataset/model.
* Users can delete a dataset/model.
## Design Details
### CRD API Group and Version
The `Dataset` and `Model` CRDs will be namespace-scoped.
The tables below summarize the group, kind and API version details for the CRDs.
* Dataset
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | Dataset |
* Model
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | Model |
### CRDs
#### `Dataset` CRD
[crd source](/build/crds/neptune/dataset_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: datasets.neptune.io
spec:
  group: neptune.io
  names:
    kind: Dataset
    plural: datasets
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - url
                - format
              properties:
                url:
                  type: string
                format:
                  type: string
                nodeName:
                  type: string
            status:
              type: object
              properties:
                numberOfSamples:
                  type: integer
                updateTime:
                  type: string
                  format: date-time
      additionalPrinterColumns:
        - name: NumberOfSamples
          type: integer
          description: The number of samples in the dataset
          jsonPath: ".status.numberOfSamples"
        - name: Node
          type: string
          description: The node name of the dataset
          jsonPath: ".spec.nodeName"
        - name: spec
          type: string
          description: The spec of the dataset
          jsonPath: ".spec"
```
1. `format` of dataset

   We use this field to report the number of samples for the dataset and to do dataset splitting.
   Currently we support the formats below:
   - txt: one nonempty line is one sample
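
For the `txt` format, counting samples therefore reduces to counting nonempty lines. Below is a minimal Go sketch of such a counter; the function name and the file path are illustrative assumptions, not part of the current code base:

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// countTxtSamples counts the samples of a txt-format dataset,
// where each nonempty line is one sample.
func countTxtSamples(path string) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	count := 0
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if strings.TrimSpace(scanner.Text()) != "" {
			count++
		}
	}
	return count, scanner.Err()
}

func main() {
	// hypothetical dataset index file under the dataset url
	n, err := countTxtSamples("/code/data/train_data.txt")
	if err != nil {
		fmt.Println("count samples:", err)
		return
	}
	// such a count would be reported as .status.numberOfSamples
	fmt.Println("numberOfSamples:", n)
}
```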
#### `Model` CRD
[crd source](/build/crds/neptune/model_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: models.neptune.io
spec:
  group: neptune.io
  names:
    kind: Model
    plural: models
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - url
                - format
              properties:
                url:
                  type: string
                format:
                  type: string
            status:
              type: object
              properties:
                updateTime:
                  type: string
                  format: date-time
                metrics:
                  type: array
                  items:
                    type: object
                    properties:
                      key:
                        type: string
                      value:
                        type: string
      additionalPrinterColumns:
        - name: updateAGE
          type: date
          description: The update age
          jsonPath: ".status.updateTime"
        - name: metrics
          type: string
          description: The metrics
          jsonPath: ".status.metrics"
```
### CRD type definition
- `Dataset`
[go source](cloud/pkg/apis/neptune/v1alpha1/dataset_types.go)
```go
package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// Dataset describes the data that a dataset resource should have
type Dataset struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   DatasetSpec   `json:"spec"`
	Status DatasetStatus `json:"status"`
}

// DatasetSpec is a description of a dataset
type DatasetSpec struct {
	URL      string `json:"url"`
	Format   string `json:"format"`
	NodeName string `json:"nodeName"`
}

// DatasetStatus represents information about the status of a dataset
// including the time a dataset updated, and number of samples in a dataset
type DatasetStatus struct {
	UpdateTime      *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"`
	NumberOfSamples int          `json:"numberOfSamples"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// DatasetList is a list of Datasets
type DatasetList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []Dataset `json:"items"`
}
```
- `Model`
[go source](cloud/pkg/apis/neptune/v1alpha1/model_types.go)
```go
package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// Model describes the data that a model resource should have
type Model struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   ModelSpec   `json:"spec"`
	Status ModelStatus `json:"status"`
}

// ModelSpec is a description of a model
type ModelSpec struct {
	URL    string `json:"url"`
	Format string `json:"format"`
}

// ModelStatus represents information about the status of a model
// including the time a model updated, and metrics in a model
type ModelStatus struct {
	UpdateTime *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"`
	Metrics    []Metric     `json:"metrics,omitempty" protobuf:"bytes,2,rep,name=metrics"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// ModelList is a list of Models
type ModelList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []Model `json:"items"`
}
```
### Crd samples
- `Dataset`
```yaml
apiVersion: neptune.io/v1alpha1
kind: Dataset
metadata:
  name: "dataset-examp"
spec:
  url: "/code/data"
  format: "txt"
  nodeName: "edge0"
```
- `Model`
```yaml
apiVersion: neptune.io/v1alpha1
kind: Model
metadata:
  name: model-examp
spec:
  url: "/model/frozen.pb"
  format: pb
```
## Controller Design
In the current design there is a downstream/upstream controller for `dataset`, but no downstream/upstream controller for `model`.<br/>
The dataset controller synchronizes the dataset between the cloud and edge.
- downstream: synchronize the dataset info from the cloud to the edge node.
- upstream: synchronize the dataset status from the edge to the cloud node, such as the number of samples the dataset contains.
<br/>
Here is the flow of the dataset creation:
![](./images/dataset-creation-flow.png)
For the model:
1. The model's info will be synced when syncing the federated-learning task (or other feature task) which uses the model.
1. The model's status will be updated when the corresponding training/inference work has completed.

BIN  docs/proposals/images/dataset-creation-flow.png (868 × 643, 38 kB)

BIN  docs/proposals/images/dataset-model-crd.png (749 × 663, 24 kB)

BIN  docs/proposals/images/framework-zh.png (945 × 650, 54 kB)

BIN  docs/proposals/images/framework.png (915 × 643, 56 kB)

BIN  docs/proposals/images/joint-inference-controller.png (762 × 324, 26 kB)

BIN  docs/proposals/images/joint-inference-downstream-controller.png (890 × 674, 32 kB)

BIN  docs/proposals/images/joint-inference-flow-creation.png (939 × 703, 55 kB)

BIN  docs/proposals/images/joint-inference-service-crd-details.png (814 × 613, 34 kB)

BIN  docs/proposals/images/joint-inference-service-crd.png (755 × 674, 24 kB)

BIN  docs/proposals/images/joint-inference-upstream-controller.png (731 × 675, 32 kB)

BIN  docs/proposals/images/joint-inference-worker-communication.png (870 × 687, 43 kB)

docs/proposals/joint-inference.md (+553, -0)

@@ -0,0 +1,553 @@
* [Joint Inference](#joint-inference)
  * [Motivation](#motivation)
    * [Goals](#goals)
    * [Non\-goals](#non-goals)
  * [Proposal](#proposal)
    * [Use Cases](#use-cases)
  * [Design Details](#design-details)
    * [CRD API Group and Version](#crd-api-group-and-version)
    * [Joint inference CRD](#joint-inference-crd)
    * [Joint inference type definition](#joint-inference-type-definition)
    * [Joint inference sample](#joint-inference-sample)
    * [Validation](#validation)
  * [Controller Design](#controller-design)
    * [Joint Inference Controller](#joint-inference-controller)
    * [Downstream Controller](#downstream-controller)
    * [Upstream Controller](#upstream-controller)
    * [Details of api between GM(cloud) and LC(edge)](#details-of-api-between-gmcloud-and-lcedge)
    * [Details of api between Worker(edge) and LC(edge)](#details-of-api-between-workeredge-and-lcedge)
    * [Flow of Joint Inference](#flow-of-joint-inference)
  * [Workers Communication](#workers-communication)
# Joint Inference
## Motivation
Inference on the edge achieves lower latency and higher throughput, while inference on the cloud achieves better precision.
The collaborative inference technology detects hard samples on the edge and sends them to the cloud for inference.
**In this way, inferring simple samples on the edge preserves latency and throughput, while inferring hard samples on the cloud improves the overall precision.**
### Goals
* Joint inference improves inference precision without significantly increasing latency or reducing throughput.
## Proposal
We propose using Kubernetes Custom Resource Definitions (CRDs) to describe
the joint inference specification/status and a controller to synchronize these updates between edge and cloud.
![](./images/joint-inference-service-crd.png)
### Use Cases
* Users can create a joint inference service by providing the edge/cloud inference worker scripts,
specifying the small/big models and the hard example mining algorithm,
and configuring the inference parameters.
* Users can get the joint inference service status, including the counts of inference at the edge/cloud.
## Design Details
### CRD API Group and Version
The `JointInferenceService` CRD will be namespace-scoped.
The tables below summarize the group, kind and API version details for the CRD.
* JointInferenceService
| Field | Description |
|-----------------------|-------------------------|
|Group | neptune.io |
|APIVersion | v1alpha1 |
|Kind | JointInferenceService |
### Joint inference CRD
![](./images/joint-inference-service-crd-details.png)
Below is the CustomResourceDefinition yaml for `JointInferenceService`:
[crd source](/build/crds/neptune/jointinferenceservice_v1alpha1.yaml)
```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: jointinferenceservices.neptune.io
spec:
  group: neptune.io
  names:
    kind: JointInferenceService
    plural: jointinferenceservices
    shortNames:
      - jointinferenceservice
      - jis
  scope: Namespaced
  versions:
    - name: v1alpha1
      subresources:
        # status enables the status subresource.
        status: {}
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              required:
                - edgeWorker
                - cloudWorker
              properties:
                edgeWorker:
                  type: object
                  required:
                    - name
                    - model
                    - nodeName
                    - hardExampleAlgorithm
                    - workerSpec
                  properties:
                    name:
                      type: string
                    model:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    nodeName:
                      type: string
                    hardExampleAlgorithm:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    workerSpec:
                      type: object
                      required:
                        - scriptDir
                        - scriptBootFile
                        - frameworkType
                        - frameworkVersion
                      properties:
                        scriptDir:
                          type: string
                        scriptBootFile:
                          type: string
                        frameworkType:
                          type: string
                        frameworkVersion:
                          type: string
                        parameters:
                          type: array
                          items:
                            type: object
                            required:
                              - key
                              - value
                            properties:
                              key:
                                type: string
                              value:
                                type: string
                cloudWorker:
                  type: object
                  required:
                    - name
                    - model
                    - nodeName
                    - workerSpec
                  properties:
                    name:
                      type: string
                    model:
                      type: object
                      required:
                        - name
                      properties:
                        name:
                          type: string
                    nodeName:
                      type: string
                    workerSpec:
                      type: object
                      required:
                        - scriptDir
                        - scriptBootFile
                        - frameworkType
                        - frameworkVersion
                      properties:
                        scriptDir:
                          type: string
                        scriptBootFile:
                          type: string
                        frameworkType:
                          type: string
                        frameworkVersion:
                          type: string
                        parameters:
                          type: array
                          items:
                            type: object
                            required:
                              - key
                              - value
                            properties:
                              key:
                                type: string
                              value:
                                type: string
            status:
              type: object
              properties:
                conditions:
                  type: array
                  items:
                    type: object
                    properties:
                      type:
                        type: string
                      status:
                        type: string
                      lastHeartbeatTime:
                        type: string
                        format: date-time
                      lastTransitionTime:
                        type: string
                        format: date-time
                      reason:
                        type: string
                      message:
                        type: string
                startTime:
                  type: string
                  format: date-time
                active:
                  type: integer
                failed:
                  type: integer
                metrics:
                  type: array
                  items:
                    type: object
                    properties:
                      key:
                        type: string
                      value:
                        type: string
      additionalPrinterColumns:
        - name: status
          type: string
          description: The status of the jointinference service
          jsonPath: ".status.conditions[-1].type"
        - name: active
          type: integer
          description: The number of active worker
          jsonPath: ".status.active"
        - name: failed
          type: integer
          description: The number of failed worker
          jsonPath: ".status.failed"
        - name: Age
          type: date
          jsonPath: .metadata.creationTimestamp
```
### Joint inference type definition
[go source](cloud/pkg/apis/neptune/v1alpha1/jointinferenceservice_types.go)
```go
package v1alpha1

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// JointInferenceService describes the data that a jointinferenceservice resource should have
type JointInferenceService struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata"`

	Spec   JointInferenceServiceSpec   `json:"spec"`
	Status JointInferenceServiceStatus `json:"status,omitempty"`
}

// JointInferenceServiceSpec is a description of a jointinferenceservice
type JointInferenceServiceSpec struct {
	EdgeWorker  EdgeWorker  `json:"edgeWorker"`
	CloudWorker CloudWorker `json:"cloudWorker"`
}

// EdgeWorker describes the data an edge worker should have
type EdgeWorker struct {
	Name                 string               `json:"name"`
	Model                SmallModel           `json:"model"`
	NodeName             string               `json:"nodeName"`
	HardExampleAlgorithm HardExampleAlgorithm `json:"hardExampleAlgorithm"`
	WorkerSpec           CommonWorkerSpec     `json:"workerSpec"`
}

// CloudWorker describes the data a cloud worker should have
type CloudWorker struct {
	Name       string           `json:"name"`
	Model      BigModel         `json:"model"`
	NodeName   string           `json:"nodeName"`
	WorkerSpec CommonWorkerSpec `json:"workerSpec"`
}

type SmallModel struct {
	Name string `json:"name"`
}

type BigModel struct {
	Name string `json:"name"`
}

type HardExampleAlgorithm struct {
	Name string `json:"name"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// JointInferenceServiceList is a list of JointInferenceServices.
type JointInferenceServiceList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	Items           []JointInferenceService `json:"items"`
}

// JointInferenceServiceStatus represents the current state of a joint inference service.
type JointInferenceServiceStatus struct {
	// The latest available observations of a joint inference service's current state.
	// +optional
	Conditions []JointInferenceServiceCondition `json:"conditions,omitempty"`

	// Represents time when the service was acknowledged by the service controller.
	// It is not guaranteed to be set in happens-before order across separate operations.
	// It is represented in RFC3339 form and is in UTC.
	// +optional
	StartTime *metav1.Time `json:"startTime,omitempty"`

	// The number of actively running workers.
	// +optional
	Active int32 `json:"active"`

	// The number of workers which reached the Failed phase.
	// +optional
	Failed int32 `json:"failed"`

	// Metrics of the joint inference service.
	Metrics []Metric `json:"metrics,omitempty"`
}

type JointInferenceServiceConditionType string

// These are valid conditions of a service.
const (
	// JointInferenceServiceCondPending means the service has been accepted by the system,
	// but one or more of the workers has not been started.
	JointInferenceServiceCondPending JointInferenceServiceConditionType = "Pending"
	// JointInferenceServiceCondFailed means the service has failed its execution.
	JointInferenceServiceCondFailed JointInferenceServiceConditionType = "Failed"
	// JointInferenceServiceCondRunning means the service is running.
	JointInferenceServiceCondRunning JointInferenceServiceConditionType = "Running"
)

// JointInferenceServiceCondition describes the current state of a service.
type JointInferenceServiceCondition struct {
	// Type of service condition, Pending, Running or Failed.
	Type JointInferenceServiceConditionType `json:"type"`
	// Status of the condition, one of True, False, Unknown.
	Status v1.ConditionStatus `json:"status"`
	// Last time the condition was checked.
	// +optional
	LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
	// Last time the condition transit from one status to another.
	// +optional
	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
	// (brief) reason for the condition's last transition.
	// +optional
	Reason string `json:"reason,omitempty"`
	// Human readable message indicating details about last transition.
	// +optional
	Message string `json:"message,omitempty"`
}
```
#### Validation
[Open API v3 Schema based validation](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#validation) can be used to guard against bad requests.
Invalid values for fields (e.g., a string value for a boolean field) can be validated using this.
Here is a list of validations we need to support:
1. The `dataset` specified in the crd should exist in k8s.
1. The `model` specified in the crd should exist in k8s.
1. The edge node name specified in the crd should exist in k8s.
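
As an illustration, such an existence check can be done with the k8s dynamic client. This is only a sketch under the CRD group/version defined above; the function name and wiring are assumptions, not the actual GM code:

```go
package validation

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
)

// modelGVR identifies the Model CRD defined in the dataset-and-model proposal.
var modelGVR = schema.GroupVersionResource{
	Group:    "neptune.io",
	Version:  "v1alpha1",
	Resource: "models",
}

// checkModelExists verifies that the model referenced by a
// JointInferenceService exists in the given namespace.
func checkModelExists(ctx context.Context, client dynamic.Interface, namespace, name string) error {
	if _, err := client.Resource(modelGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}); err != nil {
		return fmt.Errorf("model %s/%s not found: %w", namespace, name, err)
	}
	return nil
}
```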
### Joint inference sample
```yaml
apiVersion: neptune.io/v1alpha1
kind: JointInferenceService
metadata:
  name: helmet-detection-demo
  namespace: default
spec:
  edgeWorker:
    name: "edgeworker"
    model:
      name: "small-model"
    nodeName: "edge0"
    hardExampleAlgorithm:
      name: "IBT"
    workerSpec:
      scriptDir: "/code"
      scriptBootFile: "edge_inference.py"
      frameworkType: "tensorflow"
      frameworkVersion: "1.18"
      parameters:
        - key: "nms_threshold"
          value: "0.6"
  cloudWorker:
    name: "work"
    model:
      name: "big-model"
    nodeName: "solar-corona-cloud"
    workerSpec:
      scriptDir: "/code"
      scriptBootFile: "cloud_inference.py"
      frameworkType: "tensorflow"
      frameworkVersion: "1.18"
      parameters:
        - key: "nms_threshold"
          value: "0.6"
```
## Controller Design
The joint inference controller starts three separate goroutines called the `upstream`, `downstream` and `joint-inference` controller. These are not separate controllers as such but are named here for clarity.
- joint-inference: watches the updates of joint-inference service CRDs, and creates the workers to complete the task.
- downstream: synchronizes the joint-inference updates from the cloud to the edge node.
- upstream: synchronizes the joint-inference updates from the edge to the cloud node.
### Joint Inference Controller
![](./images/joint-inference-controller.png)
The joint-inference controller watches for the updates of joint-inference tasks and the corresponding pods against the K8S API server.
Updates are categorized below along with the possible actions:
| Update Type | Action |
|-------------------------------|---------------------------------------------- |
|New Joint-inference-service Created |Create the cloud/edge worker|
|Joint-inference-service Deleted | NA. These workers will be deleted by GM.|
|The corresponding pod created/running/completed/failed | Update the status of joint-inference task.|
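
A simplified sketch of this dispatch is shown below; the `service` type and the two callbacks are placeholders standing in for the real worker-creation and status-update logic, not the actual GM implementation:

```go
package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/watch"
)

// service is a placeholder for the JointInferenceService object defined above.
type service struct {
	Namespace string
	Name      string
}

// handleServiceEvent mirrors the table above: it maps a watch event
// for a joint-inference service to the corresponding action.
func handleServiceEvent(ctx context.Context, eventType watch.EventType, svc service,
	createWorkers func(context.Context, service) error,
	updateStatus func(context.Context, service) error) error {
	switch eventType {
	case watch.Added:
		// a new joint-inference service: create the cloud/edge workers
		return createWorkers(ctx, svc)
	case watch.Modified:
		// the corresponding pod was created/running/completed/failed:
		// update the status of the joint-inference task
		return updateStatus(ctx, svc)
	case watch.Deleted:
		// nothing to do here: the workers will be deleted by GM
		return nil
	}
	return nil
}
```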
### Downstream Controller
![](./images/joint-inference-downstream-controller.png)
The downstream controller watches for joint-inference updates against the K8S API server.
Updates are categorized below along with the possible actions that the downstream controller can take:
| Update Type | Action |
|-------------------------------|---------------------------------------------- |
|New Joint-inference-service Created |Sends the task information to LCs.|
|Joint-inference-service Deleted | The controller sends the delete event to LCs.|
### Upstream Controller
![](./images/joint-inference-upstream-controller.png)
The upstream controller watches for joint-inference-task updates from the edge node and applies these updates against the API server in the cloud.
Updates are categorized below along with the possible actions that the upstream controller can take:
| Update Type | Action |
|------------------------------- |---------------------------------------------- |
|Joint-inference-service Reported State Updated | The controller appends the reported status of the Joint-inference-service in the cloud. |
### Details of api between GM(cloud) and LC(edge)
1. GM(downstream controller) syncs the task info to LC:
```go
// POST <namespace>/neptune/downstream/jointinferenceservices/<name>/insert
// body same to the task crd of k8s api, omitted here.
```
1. LC uploads the task status reported by the worker to GM (upstream controller):
```go
// POST <namespace>/neptune/upstream/jointinferenceservices/<name>/status
// JoinInferenceServiceStatus defines the status that is sent to GlobalManager
type JoinInferenceServiceStatus struct {
	Phase  string  `json:"phase"`
	Status string  `json:"status"`
	Output *Output `json:"output"`
}

// Output defines task output information
type Output struct {
	Models   []Model   `json:"models"`
	TaskInfo *TaskInfo `json:"taskInfo"`
}

// Model defines the model information
type Model struct {
	Format string `json:"format"`
	URL    string `json:"url"`
}

// TaskInfo defines the task information
type TaskInfo struct {
	InferenceNumber   int     `json:"inferenceNumber"`
	HardExampleNumber int     `json:"hardExampleNumber"`
	UploadCloudRatio  float64 `json:"uploadCloudRatio"`
	StartTime         string  `json:"startTime"`
	CurrentTime       string  `json:"currentTime"`
}
```
```
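
For illustration, a minimal sketch of how LC could post this status to the GM endpoint above. The HTTP scheme, error handling, and function name are assumptions for this sketch, not the actual LC code:

```go
package lc

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// reportServiceStatus posts the status reported by the worker to the GM
// upstream endpoint described above. gmAddress is e.g. "192.168.0.9:9000";
// status is expected to be the JoinInferenceServiceStatus defined above.
func reportServiceStatus(gmAddress, namespace, name string, status interface{}) error {
	url := fmt.Sprintf("http://%s/%s/neptune/upstream/jointinferenceservices/%s/status",
		gmAddress, namespace, name)

	body, err := json.Marshal(status)
	if err != nil {
		return err
	}

	resp, err := http.Post(url, "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status code %d from GM", resp.StatusCode)
	}
	return nil
}
```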
### Details of api between Worker(edge) and LC(edge)
1. The worker sends inference info to the LC on the same edge node:
```
// POST /neptune/workers/<worker-name>/info
```
```json
{
    "name": "worker-name",
    "namespace": "default",
    "ownerName": "jointinferenceservice-name",
    "ownerKind": "jointinferenceservice",
    "kind": "inference",
    "status": "completed/failed/running",
    "taskInfo": {
        "inferenceNumber": 1000,
        "hardExampleNumber": 100,
        "uploadCloudRatio": 0.1,
        "startTime": "2020-11-03T08:39:22.517Z",
        "updateTime": "2020-11-03T08:50:22.517Z"
    }
}
```
### Flow of Joint Inference
- The flow of joint inference service creation:
![](./images/joint-inference-flow-creation.png)
## Workers Communication
![](./images/joint-inference-worker-communication.png)
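
To make the communication in the diagram concrete, here is a sketch of the edge-side flow: infer with the small model, run the hard example algorithm, and forward hard samples to the cloud worker. All function names and the cloud endpoint are illustrative assumptions; the real workers are implemented on top of the lib and an existing ML framework:

```go
package worker

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// InferenceResult is an illustrative result type shared by edge and cloud workers.
type InferenceResult struct {
	Boxes [][]float64 `json:"boxes"`
}

// jointInfer runs the small model on the edge first; if the hard example
// algorithm (e.g. IBT) marks the sample as hard, the raw sample is sent to
// the cloud worker, whose big-model result replaces the edge result.
func jointInfer(sample []byte,
	edgeInfer func([]byte) (InferenceResult, error),
	isHardExample func(InferenceResult) bool,
	cloudURL string) (InferenceResult, error) {

	result, err := edgeInfer(sample)
	if err != nil {
		return InferenceResult{}, err
	}
	if !isHardExample(result) {
		// simple sample: keep the edge result
		return result, nil
	}

	// hard sample: forward to the cloud worker for big-model inference
	resp, err := http.Post(cloudURL, "application/octet-stream", bytes.NewReader(sample))
	if err != nil {
		return result, fmt.Errorf("cloud inference failed, keeping edge result: %w", err)
	}
	defer resp.Body.Close()

	var cloudResult InferenceResult
	if err := json.NewDecoder(resp.Body).Decode(&cloudResult); err != nil {
		return result, err
	}
	return cloudResult, nil
}
```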

docs/quickstart.md (+23, -0)

@@ -0,0 +1,23 @@
## Getting started

Neptune is an open source framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement and data privacy protection.

### Get Neptune

You can find the latest Neptune release [here](TODO)

### Deploying Neptune

Please refer to this [link](setup/install.html).

### Examples
Please refer to this [link](TODO).

### Contributing

Contributions are very welcome! You can see our [CONTRIBUTING.md](TODO) for more information.

### Community

Neptune is an open source project. In the spirit of openness and freedom, we welcome new contributors to join us. You can get in touch with the community through the following channels:
* [Github Issues](TODO)

docs/roadmap.md (+21, -0)

@@ -0,0 +1,21 @@
# Roadmap

This document defines a high-level roadmap for Neptune development.

The [milestones defined in GitHub](https://github.com/edgeai-neptune/neptune/milestones) represent the most up-to-date plans.


## 2021 Q1 Roadmap

- Support edge model and dataset management.
- Support incremental learning, with time trigger, sample-size trigger, and precision-based trigger, and integrate a hard sample discovering algorithm.
- Support collaborative training, integrating some common weight/gradient compression algorithms.


## Future

- Integrate some common multi-task migration algorithms to resolve the problem of low precision caused by small sample sizes.
- Integrate KubeFlow and ONNX into Neptune, to enable interoperability of edge models with diverse formats.
- Integrate typical AI frameworks into Neptune, including TensorFlow, PyTorch, PaddlePaddle, MindSpore, etc.



docs/setup/install.md (+294, -0)

@@ -0,0 +1,294 @@
* [Prerequisites](#prerequisites)
* [Download project source](#download-source)
* [Create CRDs](#create-crds)
* [Deploy GM](#deploy-gm)
  * [Prepare GM config](#prepare-gm-config)
  * [Build worker base images](#build-worker-base-images)
  * [Run GM as k8s pod(recommended)](#run-gm-as-k8s-podrecommended)
  * [Run GM as a single process(alternative)](#run-gm-as-a-single-processalternative)
  * [Run GM as docker container(alternative)](#run-gm-as-docker-containeralternative)
* [Deploy LC](#deploy-lc)

## Deploy Neptune

### Prerequisites

- [GIT][git_tool]
- [GO][go_tool] version v1.15+.
- [Kubernetes][kubernetes] 1.16+.
- [KubeEdge][kubeedge] version v1.5+.

GM will be deployed to a node which satisfies these requirements:
1. Has a public IP address which the edge nodes can access.
1. Can access the k8s master.

You can simply use the node where the `cloudcore` of `kubeedge` is deployed.

The shell commands below should be executed on this node and in **one terminal session**, so that the shell variables are kept.

### Download source
```shell
git clone http://github.com/edgeai-neptune/neptune.git
cd neptune
git checkout master
```

### Create CRDs

```shell
# create these crds including dataset, model, joint-inference
kubectl apply -f build/crds/neptune/
```

### Deploy GM

#### Prepare GM config
The config file `build/gm/gm-config.yaml` looks like:
```yaml
kubeConfig: ""
master: ""
namespace: ""
imageHub:
  "tensorflow:1.15": "docker.io/neptune/tensorflow-base-image-to-filled:1.15"
websocket:
  address: 0.0.0.0
  port: 9000
localController:
  server: http://localhost:9100
```
1. `kubeConfig`: config to connect k8s, default `""`
1. `master`: k8s master addr, default `""`
1. `namespace`: the namespace GM watches, `""` means that gm watches all namespaces, default `""`.
1. `imageHub`: the base image mapping for model training/evaluation/inference, whose key is frameworkType/frameworkVersion.
1. `websocket`: due to the current limitation of kubeedge (1.5), GM needs to build a websocket channel for communication between GM and LCs.
1. `localController`:
- `server`: to be injected into the worker to connect LC.

#### Build worker base images

Here we build the worker base image for TensorFlow 1.15 as an example:
```shell
# edit it with the base repo of your choice.
IMG_BASE_ADDR=docker.io/neptune

# build tensorflow image
WORKER_TF1_IMAGE=$IMG_BASE_ADDR/worker-tensorflow:1.15

docker build -f build/worker/base_images/tensorflow/tensorflow-1.15.Dockerfile -t $WORKER_TF1_IMAGE .

# push worker image to registry, login to registry first if needed
docker push $WORKER_TF1_IMAGE
```



There are several ways to run GM; you can choose one of the methods below:

#### Run GM as k8s pod(**recommended**):

We don't need to configure kubeconfig in this method; see [Accessing the API from a Pod](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod).

1\. Create the cluster role so that GM can access/write the CRDs:
```shell
# create the cluster role
kubectl create -f build/gm/rbac/
```

2\. Prepare the config:
```shell
# edit it with another number if you wish
GM_PORT=9000
LC_PORT=9100

# fill in the GM node's IP which the edge nodes can access.
# such as GM_IP=192.168.0.9
GM_IP=<GM_NODE_NAME_IP_ADDRESS>

# edit it with the base repo of your choice.
IMG_BASE_ADDR=docker.io/neptune

GM_ADDRESS=$GM_IP:$GM_PORT
LC_SERVER="http://localhost:$LC_PORT"

```

```shell
# copy and edit CONFIG_FILE.
CONFIG_FILE=gm-config.yaml
cp build/gm/gm-config.yaml $CONFIG_FILE

# prepare the config with an empty kubeconfig and an empty master url, meaning k8s is accessed by rest.InClusterConfig().
# here we use sed; alternatively you can edit the config file manually.
sed -i 's@kubeConfig:.*@kubeConfig: ""@' $CONFIG_FILE
sed -i 's@master:.*@master: ""@' $CONFIG_FILE

sed -i "s@port:.*@port: $GM_PORT@" $CONFIG_FILE

# setting tensorflow1.15 base image
sed -i 's@\("tensorflow:1.15":\).*@\1 '"$WORKER_TF1_IMAGE@" $CONFIG_FILE

# setting lc server
sed -i "s@http://localhost:9100@$LC_SERVER@" $CONFIG_FILE

```

3\. Build the GM image:
```shell
# build the image from source OR use the gm image previously built.

# edit it with the base repo of your choice.
GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1

# build docker image
docker build -f build/gm/Dockerfile --tag $GM_IMAGE .

# push image to registry, login to registry first if needed
docker push $GM_IMAGE
```

4\. Create gm configmap:
```shell
# create configmap from $CONFIG_FILE
CONFIG_NAME=neptune-gm-config # customize this configmap name
kubectl create configmap $CONFIG_NAME --from-file=$CONFIG_FILE
```

5\. Deploy GM as pod:
```shell
# we assign gm to a node which the edge nodes can access.
# here we use the current terminal's node name, i.e. the k8s master node.
# remember the GM_IP
GM_NODE_NAME=$(hostname)
GM_POD_NAME=gm-from-$CONFIG_NAME
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: $GM_POD_NAME
spec:
  restartPolicy: OnFailure
  hostNetwork: true
  nodeName: $GM_NODE_NAME
  containers:
    - name: gm
      image: $GM_IMAGE
      command: ["neptune-gm", "--config", "/config/$CONFIG_FILE", "-v2"]
      volumeMounts:
        - name: gm-config
          mountPath: /config
  volumes:
    - name: gm-config
      configMap:
        name: $CONFIG_NAME
EOF
```

6\. Check the GM status:
```shell
kubectl get pod $GM_POD_NAME
```

#### Run GM as a single process(alternative)
1\. config GM:
```shell
cp build/gm/neptune-gm.yaml gm.yaml
# make sure /root/.kube/config exists
sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' gm.yaml
```

2\. Compile and run GM directly:
```shell
go build cmd/neptune-gm/neptune-gm.go
./neptune-gm --config gm.yaml -v2
```

#### Run GM as docker container(alternative)
1\. build GM image:
```shell
GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1
sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' build/gm/neptune-gm.yaml
docker build -f build/gm/Dockerfile --tag $GM_IMAGE .
```

2\. run GM as container:
```shell
docker run --net host -v /root/.kube:/root/.kube $GM_IMAGE
```

### Deploy LC
Prerequisites:
1. Run GM successfully.
2. Get the bind address/port of GM.

Steps:

1\. Build LC image:
```shell
LC_IMAGE=$IMG_BASE_ADDR/neptune-lc:v1alpha1

docker build -f build/lc/Dockerfile --tag $LC_IMAGE .

# push image to registry, login to registry first if needed
docker push $LC_IMAGE
```

2\. Deploy LC as k8s daemonset:
```shell
LC_DS_NAME=edge-lc

kubectl create -f- <<EOF
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    k8s-app: neptune-lc
  name: $LC_DS_NAME
  namespace: default
spec:
  selector:
    matchLabels:
      k8s-app: $LC_DS_NAME
  template:
    metadata:
      labels:
        k8s-app: $LC_DS_NAME
    spec:
      containers:
        - name: $LC_DS_NAME
          image: $LC_IMAGE
          imagePullPolicy: Always
          env:
            - name: GM_ADDRESS
              value: $GM_ADDRESS
            - name: BIND_PORT
              value: "$LC_PORT"
            - name: NODENAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: ROOTFS_MOUNT_DIR
              # the value of ROOTFS_MOUNT_DIR is the same as the mount path of the volume
              value: /rootfs
          volumeMounts:
            - name: localcontroller
              mountPath: /rootfs
      volumes:
        - name: localcontroller
          hostPath:
            path: /
      restartPolicy: Always
      hostNetwork: true
EOF
```

3\. Check the LC status:
```shell
kubectl get ds $LC_DS_NAME

kubectl get pod |grep $LC_DS_NAME
```

[git_tool]:https://git-scm.com/downloads
[go_tool]:https://golang.org/dl/
[kubeedge]:https://github.com/kubeedge/kubeedge
[kubernetes]:https://kubernetes.io/
