1. add the design document of the basic framework. 2. add the roadmap. 3. add the proposal of joint inference. Signed-off-by: Jie Pu <pujie2@huawei.com>tags/v0.1.0
| @@ -0,0 +1,34 @@ | |||||
| # Minimal makefile for Sphinx documentation | |||||
| # | |||||
| # You can set these variables from the command line, and also | |||||
| # from the environment for the first two. | |||||
| SPHINXOPTS ?= | |||||
| SPHINXBUILD ?= sphinx-build | |||||
| SOURCEDIR = . | |||||
| BUILDDIR = build | |||||
| SPHINXAPIDOC = sphinx-apidoc | |||||
| # Put it first so that "make" without argument is like "make help". | |||||
| help: | |||||
| @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) | |||||
| .PHONY: help Makefile | |||||
| # Catch-all target: route all unknown targets to Sphinx using the new | |||||
| # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). | |||||
| %: Makefile | |||||
| @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) | |||||
| clean: | |||||
| rm -rf "$(BUILDDIR)" | |||||
| api: | |||||
| rm -rf ./source/api/* | |||||
| @$(SPHINXAPIDOC) -M -o ./lib-api/joint_inference ../lib/neptune/joint_inference | |||||
| html: | |||||
| @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) | |||||
| all: clean api html | |||||
| @@ -0,0 +1,3 @@ | |||||
| .wy-nav-content{ | |||||
| max-width: 100%; | |||||
| } | |||||
| @@ -0,0 +1,79 @@ | |||||
| # Configuration file for the Sphinx documentation builder. | |||||
| # | |||||
| # This file only contains a selection of the most common options. For a full | |||||
| # list see the documentation: | |||||
| # https://www.sphinx-doc.org/en/master/usage/configuration.html | |||||
| # -- Path setup -------------------------------------------------------------- | |||||
| # If extensions (or modules to document with autodoc) are in another directory, | |||||
| # add these directories to sys.path here. If the directory is relative to the | |||||
| # documentation root, use os.path.abspath to make it absolute, like shown here. | |||||
| # | |||||
| import os | |||||
| import sys | |||||
| import sphinx_rtd_theme | |||||
| sys.path.insert(0, os.path.abspath('../lib')) | |||||
| sys.path.insert(0, os.path.abspath('../lib/neptune')) | |||||
| # -- Project information ----------------------------------------------------- | |||||
| project = 'Neptune' | |||||
| copyright = '2020, Kubeedge' | |||||
| author = 'Kubeedge' | |||||
| # -- General configuration --------------------------------------------------- | |||||
| from recommonmark.parser import CommonMarkParser | |||||
| source_parsers = { | |||||
| '.md': CommonMarkParser, | |||||
| } | |||||
| # Add any Sphinx extension module names here, as strings. They can be | |||||
| # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom | |||||
| # ones. | |||||
| extensions = ['m2r2', 'sphinx.ext.autodoc', 'sphinx_markdown_tables', ] | |||||
| # Add any paths that contain templates here, relative to this directory. | |||||
| # templates_path = ['_templates'] | |||||
| # List of patterns, relative to source directory, that match files and | |||||
| # directories to ignore when looking for source files. | |||||
| # This pattern also affects html_static_path and html_extra_path. | |||||
| exclude_patterns = [] | |||||
| # The master toctree document | |||||
| master_doc = 'index' | |||||
| # The name of the Pygments (syntax highlighting) style to use. | |||||
| pygments_style = 'sphinx' | |||||
| html_static_path = ['_static'] | |||||
| # -- Options for HTML output ------------------------------------------------- | |||||
| # The theme to use for HTML and HTML Help pages. See the documentation for | |||||
| # a list of builtin themes. | |||||
| # | |||||
| html_theme = 'sphinx_rtd_theme' | |||||
| html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] | |||||
| html_theme_options = { | |||||
| 'prev_next_buttons_location': 'both' | |||||
| } | |||||
| # Add any paths that contain custom static files (such as style sheets) here, | |||||
| # relative to this directory. They are copied after the builtin static files, | |||||
| # so a file named "default.css" will overwrite the builtin "default.css". | |||||
| # html_static_path = ['_static'] | |||||
| source_suffix = { | |||||
| '.rst': 'restructuredtext', | |||||
| '.txt': 'markdown', | |||||
| '.md': 'markdown', | |||||
| } | |||||
| def setup(app): | |||||
| app.add_stylesheet('css/custom.css') | |||||
| @@ -0,0 +1 @@ | |||||
| .. mdinclude:: ../../examples/helmet_detection_inference/README.md | |||||
| @@ -0,0 +1,57 @@ | |||||
| =========================================== | |||||
| Neptune documentation | |||||
| =========================================== | |||||
| .. toctree:: | |||||
| :maxdepth: 1 | |||||
| :caption: QUICK START | |||||
| :hidden: | |||||
| quickstart | |||||
| .. toctree:: | |||||
| :maxdepth: 1 | |||||
| :caption: INTRODUCTION | |||||
| :hidden: | |||||
| proposals/architecture | |||||
| proposals/dataset-and-model | |||||
| proposals/joint-inference | |||||
| .. toctree:: | |||||
| :maxdepth: 1 | |||||
| :caption: Setup | |||||
| setup/install | |||||
| .. toctree:: | |||||
| :maxdepth: 1 | |||||
| :caption: EXAMPLES | |||||
| :hidden: | |||||
| examples/joint_inference_example_link | |||||
| .. toctree:: | |||||
| :maxdepth: 2 | |||||
| :caption: API | |||||
| :hidden: | |||||
| lib-api/modules | |||||
| Indices and tables | |||||
| ================== | |||||
| * :ref:`genindex` | |||||
| * :ref:`modindex` | |||||
| * :ref:`search` | |||||
| @@ -0,0 +1,7 @@ | |||||
| neptune | |||||
| ======= | |||||
| .. toctree:: | |||||
| :maxdepth: 4 | |||||
| neptune | |||||
| @@ -0,0 +1,16 @@ | |||||
| neptune.hard\_example\_mining.image\_classification package | |||||
| =========================================================== | |||||
| .. automodule:: neptune.hard_example_mining.image_classification | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| neptune.hard\_example\_mining.image\_classification.hard\_mine\_filters module | |||||
| ------------------------------------------------------------------------------ | |||||
| .. automodule:: neptune.hard_example_mining.image_classification.hard_mine_filters | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| @@ -0,0 +1,18 @@ | |||||
| neptune.hard\_example\_mining.object\_detection package | |||||
| ======================================================= | |||||
| .. automodule:: neptune.hard_example_mining.object_detection | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| Submodules | |||||
| ---------- | |||||
| neptune.hard\_example\_mining.object\_detection.scores\_filters module | |||||
| ---------------------------------------------------------------------- | |||||
| .. automodule:: neptune.hard_example_mining.object_detection.scores_filters | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| @@ -0,0 +1,27 @@ | |||||
| neptune.hard\_example\_mining package | |||||
| ===================================== | |||||
| .. automodule:: neptune.hard_example_mining | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| Subpackages | |||||
| ----------- | |||||
| .. toctree:: | |||||
| :maxdepth: 4 | |||||
| neptune.hard_example_mining.image_classification | |||||
| neptune.hard_example_mining.object_detection | |||||
| Submodules | |||||
| ---------- | |||||
| neptune.hard\_example\_mining.base module | |||||
| ----------------------------------------- | |||||
| .. automodule:: neptune.hard_example_mining.base | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| @@ -0,0 +1,26 @@ | |||||
| neptune.joint\_inference package | |||||
| ================================ | |||||
| .. automodule:: neptune.joint_inference | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| Submodules | |||||
| ---------- | |||||
| neptune.joint\_inference.data module | |||||
| ------------------------------------ | |||||
| .. automodule:: neptune.joint_inference.data | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| neptune.joint\_inference.joint\_inference module | |||||
| ------------------------------------------------ | |||||
| .. automodule:: neptune.joint_inference.joint_inference | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| @@ -0,0 +1,18 @@ | |||||
| Subpackages | |||||
| ----------- | |||||
| .. toctree:: | |||||
| :maxdepth: 4 | |||||
| neptune.hard_example_mining | |||||
| neptune.joint_inference | |||||
| neptune.context module | |||||
| ---------------------- | |||||
| .. automodule:: neptune.context | |||||
| :members: | |||||
| :undoc-members: | |||||
| :show-inheritance: | |||||
| @@ -0,0 +1,35 @@ | |||||
| @ECHO OFF | |||||
| pushd %~dp0 | |||||
| REM Command file for Sphinx documentation | |||||
| if "%SPHINXBUILD%" == "" ( | |||||
| set SPHINXBUILD=sphinx-build | |||||
| ) | |||||
| set SOURCEDIR=. | |||||
| set BUILDDIR=build | |||||
| if "%1" == "" goto help | |||||
| %SPHINXBUILD% >NUL 2>NUL | |||||
| if errorlevel 9009 ( | |||||
| echo. | |||||
| echo.The 'sphinx-build' command was not found. Make sure you have Sphinx | |||||
| echo.installed, then set the SPHINXBUILD environment variable to point | |||||
| echo.to the full path of the 'sphinx-build' executable. Alternatively you | |||||
| echo.may add the Sphinx directory to PATH. | |||||
| echo. | |||||
| echo.If you don't have Sphinx installed, grab it from | |||||
| echo.http://sphinx-doc.org/ | |||||
| exit /b 1 | |||||
| ) | |||||
| %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% | |||||
| goto end | |||||
| :help | |||||
| %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% | |||||
| :end | |||||
| popd | |||||
| @@ -0,0 +1,58 @@ | |||||
| # Edge Cloud Collaborative AI Framework | |||||
| ## Motivation | |||||
| Currently, "Edge AI" in the industry is at an early stage of training on the cloud and inference on the edge. However, the future trend has emerged, and related research and practice are booming, bringing new value growth points for edge computing and AI. Also, edge AI applications have much room for optimization in terms of cost, model effect, and privacy protection. For example: | |||||
| This proposal provides a basic framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement, and data privacy protection. | |||||
| ### Goals | |||||
| For AI applications running at the edge, the goals of edge cloud collaborative framework are: | |||||
| * reducing resource cost on the edge | |||||
| * improving model performance | |||||
| * protecting data privacy | |||||
| ## Proposal | |||||
| * What we propose: | |||||
| * an edge-cloud collaborative AI framework based on KubeEdge | |||||
| * with embedded collaborative training and joint inference algorithms | |||||
| * working with existing AI framework like Tensorflow, etc | |||||
| * 3 Features: | |||||
| * joint inference | |||||
| * incremental learning | |||||
| * federated learning | |||||
| * Targeting Users: | |||||
| * Domain-specific AI Developers: build and publish edge-cloud collaborative AI services/functions easily | |||||
| * Application Developers: use edge-cloud collaborative AI capabilities. | |||||
| * We are NOT: | |||||
| * to re-invent existing ML framework, i.e., tensorflow, pytorch, mindspore, etc. | |||||
| * to re-invent existing edge platform, i.e., kubeedge, etc. | |||||
| * to offer domain/application-specific algorithms, i.e., facial recognition, text classification, etc. | |||||
| ### Architecture | |||||
|  | |||||
| * GlobalManager: implements the Edge AI features controllers based on the [k8s operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) | |||||
| * Federated Learning Controller: Implements the federated learning feature based on user created CRDs | |||||
| * Incremental Learning Controller: Implements the incremental learning feature based on user created CRDs | |||||
| * Joint Inference Controller: Implements the joint inference feature based on user created CRDs | |||||
| * LocalController: manages the Edge AI features, the extra dataset/model resources on the edge nodes | |||||
| * Workers: includes the training/evaluation/inference/aggregator | |||||
| * do inference or training, based on existing ML framework | |||||
| * launch on demand, imagine they are docker containers | |||||
| * different workers for different features | |||||
| * could run on edge or cloud | |||||
| * Lib: exposes the Edge AI features to applications, i.e. training or inference programs | |||||
| @@ -0,0 +1,350 @@ | |||||
| * [Dataset and Model](#dataset-and-model) | |||||
| * [Motivation](#motivation) | |||||
| * [Goals](#goals) | |||||
| * [Non\-goals](#non-goals) | |||||
| * [Proposal](#proposal) | |||||
| * [Use Cases](#use-cases) | |||||
| * [Design Details](#design-details) | |||||
| * [CRD API Group and Version](#crd-api-group-and-version) | |||||
| * [CRDs](#crds) | |||||
| * [Type definition](#crd-type-definition) | |||||
| * [Crd sample](#crd-samples) | |||||
| * [Controller Design](#controller-design) | |||||
| # Dataset and Model | |||||
| ## Motivation | |||||
| Currently, the Edge AI features depend on the object `dataset` and `model`. | |||||
| This proposal provides the definitions of dataset and model as the first class of k8s resources. | |||||
| ### Goals | |||||
| * Metadata of `dataset` and `model` objects. | |||||
| * Used by the Edge AI features | |||||
| ### Non-goals | |||||
| * The actual format of the AI `dataset`, such as `imagenet`, `coco` or `tf-record` etc. | |||||
| * The actual format of the AI `model`, such as `ckpt`, `saved_model` of tensorflow etc. | |||||
| * The actual operations of the AI `dataset`, such as `shuffle`, `crop` etc. | |||||
| * The actual operations of the AI `model`, such as `train`, `inference` etc. | |||||
| ## Proposal | |||||
| We propose using Kubernetes Custom Resource Definitions (CRDs) to describe | |||||
| the dataset/model specification/status and a controller to synchronize these updates between edge and cloud. | |||||
|  | |||||
| ### Use Cases | |||||
| * Users can create the dataset resource, by providing the `dataset url`, `format` and the `nodeName` which owns the dataset. | |||||
| * Users can create the model resource by providing the `model url` and `format`. | |||||
| * Users can show the information of dataset/model. | |||||
| * Users can delete the dataset/model. | |||||
| ## Design Details | |||||
| ### CRD API Group and Version | |||||
| The `Dataset` and `Model` CRDs will be namespace-scoped. | |||||
| The tables below summarize the group, kind and API version details for the CRDs. | |||||
| * Dataset | |||||
| | Field | Description | | |||||
| |-----------------------|-------------------------| | |||||
| |Group | neptune.io | | |||||
| |APIVersion | v1alpha1 | | |||||
| |Kind | Dataset | | |||||
| * Model | |||||
| | Field | Description | | |||||
| |-----------------------|-------------------------| | |||||
| |Group | neptune.io | | |||||
| |APIVersion | v1alpha1 | | |||||
| |Kind | Model | | |||||
| ### CRDs | |||||
| #### `Dataset` CRD | |||||
| [crd source](/build/crds/neptune/dataset_v1alpha1.yaml) | |||||
| ```yaml | |||||
| apiVersion: apiextensions.k8s.io/v1 | |||||
| kind: CustomResourceDefinition | |||||
| metadata: | |||||
| name: datasets.neptune.io | |||||
| spec: | |||||
| group: neptune.io | |||||
| names: | |||||
| kind: Dataset | |||||
| plural: datasets | |||||
| scope: Namespaced | |||||
| versions: | |||||
| - name: v1alpha1 | |||||
| subresources: | |||||
| # status enables the status subresource. | |||||
| status: {} | |||||
| served: true | |||||
| storage: true | |||||
| schema: | |||||
| openAPIV3Schema: | |||||
| type: object | |||||
| properties: | |||||
| spec: | |||||
| type: object | |||||
| required: | |||||
| - url | |||||
| - format | |||||
| properties: | |||||
| url: | |||||
| type: string | |||||
| format: | |||||
| type: string | |||||
| nodeName: | |||||
| type: string | |||||
| status: | |||||
| type: object | |||||
| properties: | |||||
| numberOfSamples: | |||||
| type: integer | |||||
| updateTime: | |||||
| type: string | |||||
| format: date-time | |||||
| additionalPrinterColumns: | |||||
| - name: NumberOfSamples | |||||
| type: integer | |||||
| description: The number of samples in the dataset | |||||
| jsonPath: ".status.numberOfSamples" | |||||
| - name: Node | |||||
| type: string | |||||
| description: The node name of the dataset | |||||
| jsonPath: ".spec.nodeName" | |||||
| - name: spec | |||||
| type: string | |||||
| description: The spec of the dataset | |||||
| jsonPath: ".spec" | |||||
| ``` | |||||
| 1. `format` of dataset | |||||
| We use this field to report the number of samples for the dataset and do dataset splitting. | |||||
| Currently we support the formats below: | |||||
| - txt: one nonempty line is one sample | |||||
| #### `Model` CRD | |||||
| [crd source](/build/crds/neptune/model_v1alpha1.yaml) | |||||
| ```yaml | |||||
| apiVersion: apiextensions.k8s.io/v1 | |||||
| kind: CustomResourceDefinition | |||||
| metadata: | |||||
| name: models.neptune.io | |||||
| spec: | |||||
| group: neptune.io | |||||
| names: | |||||
| kind: Model | |||||
| plural: models | |||||
| scope: Namespaced | |||||
| versions: | |||||
| - name: v1alpha1 | |||||
| subresources: | |||||
| # status enables the status subresource. | |||||
| status: {} | |||||
| served: true | |||||
| storage: true | |||||
| schema: | |||||
| openAPIV3Schema: | |||||
| type: object | |||||
| properties: | |||||
| spec: | |||||
| type: object | |||||
| required: | |||||
| - url | |||||
| - format | |||||
| properties: | |||||
| url: | |||||
| type: string | |||||
| format: | |||||
| type: string | |||||
| status: | |||||
| type: object | |||||
| properties: | |||||
| updateTime: | |||||
| type: string | |||||
| format: date-time | |||||
| metrics: | |||||
| type: array | |||||
| items: | |||||
| type: object | |||||
| properties: | |||||
| key: | |||||
| type: string | |||||
| value: | |||||
| type: string | |||||
| additionalPrinterColumns: | |||||
| - name: updateAGE | |||||
| type: date | |||||
| description: The update age | |||||
| jsonPath: ".status.updateTime" | |||||
| - name: metrics | |||||
| type: string | |||||
| description: The metrics | |||||
| jsonPath: ".status.metrics" | |||||
| ``` | |||||
| ### CRD type definition | |||||
| - `Dataset` | |||||
| [go source](cloud/pkg/apis/neptune/v1alpha1/dataset_types.go) | |||||
| ```go | |||||
| package v1alpha1 | |||||
| import ( | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| ) | |||||
| // +genclient | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // Dataset describes the data that a dataset resource should have | |||||
| type Dataset struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ObjectMeta `json:"metadata,omitempty"` | |||||
| Spec DatasetSpec `json:"spec"` | |||||
| Status DatasetStatus `json:"status"` | |||||
| } | |||||
| // DatasetSpec is a description of a dataset | |||||
| type DatasetSpec struct { | |||||
| URL string `json:"url"` | |||||
| Format string `json:"format"` | |||||
| NodeName string `json:"nodeName"` | |||||
| } | |||||
| // DatasetStatus represents information about the status of a dataset | |||||
| // including the time a dataset updated, and number of samples in a dataset | |||||
| type DatasetStatus struct { | |||||
| UpdateTime *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"` | |||||
| NumberOfSamples int `json:"numberOfSamples"` | |||||
| } | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // DatasetList is a list of Datasets | |||||
| type DatasetList struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ListMeta `json:"metadata"` | |||||
| Items []Dataset `json:"items"` | |||||
| } | |||||
| ``` | |||||
| - `Model` | |||||
| [go source](cloud/pkg/apis/neptune/v1alpha1/model_types.go) | |||||
| ```go | |||||
| package v1alpha1 | |||||
| import ( | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| ) | |||||
| // +genclient | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // Model describes the data that a model resource should have | |||||
| type Model struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ObjectMeta `json:"metadata,omitempty"` | |||||
| Spec ModelSpec `json:"spec"` | |||||
| Status ModelStatus `json:"status"` | |||||
| } | |||||
| // ModelSpec is a description of a model | |||||
| type ModelSpec struct { | |||||
| URL string `json:"url"` | |||||
| Format string `json:"format"` | |||||
| } | |||||
| // ModelStatus represents information about the status of a model | |||||
| // including the time a model updated, and metrics in a model | |||||
| type ModelStatus struct { | |||||
| UpdateTime *metav1.Time `json:"updateTime,omitempty" protobuf:"bytes,1,opt,name=updateTime"` | |||||
| Metrics []Metric `json:"metrics,omitempty" protobuf:"bytes,2,rep,name=metrics"` | |||||
| } | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // ModelList is a list of Models | |||||
| type ModelList struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ListMeta `json:"metadata"` | |||||
| Items []Model `json:"items"` | |||||
| } | |||||
| ``` | |||||
| ### Crd samples | |||||
| - `Dataset` | |||||
| ```yaml | |||||
| apiVersion: neptune.io/v1alpha1 | |||||
| kind: Dataset | |||||
| metadata: | |||||
| name: "dataset-examp" | |||||
| spec: | |||||
| url: "/code/data" | |||||
| format: "txt" | |||||
| nodeName: "edge0" | |||||
| ``` | |||||
| - `Model` | |||||
| ```yaml | |||||
| apiVersion: neptune.io/v1alpha1 | |||||
| kind: Model | |||||
| metadata: | |||||
| name: model-examp | |||||
| spec: | |||||
| url: "/model/frozen.pb" | |||||
| format: pb | |||||
| ``` | |||||
| ## Controller Design | |||||
| In the current design there is downstream/upstream controller for `dataset`, no downstream/upstream controller for `model`.<br/> | |||||
| The dataset controller synchronizes the dataset between the cloud and edge. | |||||
| - downstream: synchronize the dataset info from the cloud to the edge node. | |||||
| - upstream: synchronize the dataset status from the edge node to the cloud, such as information about how many samples the dataset has. | |||||
| <br/> | |||||
| Here is the flow of the dataset creation: | |||||
|  | |||||
| For the model: | |||||
| 1. The model's info will be synced when syncing the resource (e.g. the federated task) that uses the model. | |||||
| 1. Model's status will be updated when the corresponding training/inference work has completed. | |||||
| @@ -0,0 +1,553 @@ | |||||
| * [Joint Inference](#joint-inference) | |||||
| * [Motivation](#motivation) | |||||
| * [Goals](#goals) | |||||
| * [Non\-goals](#non-goals) | |||||
| * [Proposal](#proposal) | |||||
| * [Use Cases](#use-cases) | |||||
| * [Design Details](#design-details) | |||||
| * [CRD API Group and Version](#crd-api-group-and-version) | |||||
| * [Joint inference CRD](#joint-inference-crd) | |||||
| * [Joint inference type definition](#joint-inference-type-definition) | |||||
| * [Joint inference sample](#joint-inference-sample) | |||||
| * [Validation](#validation) | |||||
| * [Controller Design](#controller-design) | |||||
| * [Joint Inference Controller](#joint-inference-controller) | |||||
| * [Downstream Controller](#downstream-controller) | |||||
| * [Upstream Controller](#upstream-controller) | |||||
| * [Details of api between GM(cloud) and LC(edge)](#details-of-api-between-gmcloud-and-lcedge) | |||||
| * [Details of api between Worker(edge) and LC(edge)](#details-of-api-between-workeredge-and-lcedge) | |||||
| * [Flow of Joint Inference](#flow-of-joint-inference) | |||||
| * [Workers Communication](#workers-communication) | |||||
| # Joint Inference | |||||
| ## Motivation | |||||
| Inference on the edge can get a shorter latency and a higher throughput, and inference on the cloud can get better inference precision. | |||||
| The collaborative inference technology detects hard samples on the edge and sends them to the cloud for inference. | |||||
| **In this way, simple samples inference on the edge ensures latency and throughput, while hard samples inference on the cloud improves the overall precision.** | |||||
| ### Goals | |||||
| * Joint inference improves the inference precision without significantly reducing the time and throughput. | |||||
| ## Proposal | |||||
| We propose using Kubernetes Custom Resource Definitions (CRDs) to describe | |||||
| the joint inference specification/status and a controller to synchronize these updates between edge and cloud. | |||||
|  | |||||
| ### Use Cases | |||||
| * Users can create a joint inference service by providing a training script, | |||||
| specifying the aggregation algorithm, configuring training hyper parameters, | |||||
| configuring training datasets. | |||||
| * Users can get the joint inference status, including the counts of inference at the edge/cloud. | |||||
| ## Design Details | |||||
| ### CRD API Group and Version | |||||
| The `JointInferenceService` CRD will be namespace-scoped. | |||||
| The tables below summarize the group, kind and API version details for the CRD. | |||||
| * JointInferenceService | |||||
| | Field | Description | | |||||
| |-----------------------|-------------------------| | |||||
| |Group | neptune.io | | |||||
| |APIVersion | v1alpha1 | | |||||
| |Kind | JointInferenceService | | |||||
| ### Joint inference CRD | |||||
|  | |||||
| Below is the CustomResourceDefinition yaml for `JointInferenceService`: | |||||
| [crd source](/build/crds/neptune/jointinferenceservice_v1alpha1.yaml) | |||||
| ```yaml | |||||
| apiVersion: apiextensions.k8s.io/v1 | |||||
| kind: CustomResourceDefinition | |||||
| metadata: | |||||
| name: jointinferenceservices.neptune.io | |||||
| spec: | |||||
| group: neptune.io | |||||
| names: | |||||
| kind: JointInferenceService | |||||
| plural: jointinferenceservices | |||||
| shortNames: | |||||
| - jointinferenceservice | |||||
| - jis | |||||
| scope: Namespaced | |||||
| versions: | |||||
| - name: v1alpha1 | |||||
| subresources: | |||||
| # status enables the status subresource. | |||||
| status: {} | |||||
| served: true | |||||
| storage: true | |||||
| schema: | |||||
| openAPIV3Schema: | |||||
| type: object | |||||
| properties: | |||||
| spec: | |||||
| type: object | |||||
| required: | |||||
| - edgeWorker | |||||
| - cloudWorker | |||||
| properties: | |||||
| edgeWorker: | |||||
| type: object | |||||
| required: | |||||
| - name | |||||
| - model | |||||
| - nodeName | |||||
| - hardExampleAlgorithm | |||||
| - workerSpec | |||||
| properties: | |||||
| name: | |||||
| type: string | |||||
| model: | |||||
| type: object | |||||
| required: | |||||
| - name | |||||
| properties: | |||||
| name: | |||||
| type: string | |||||
| nodeName: | |||||
| type: string | |||||
| hardExampleAlgorithm: | |||||
| type: object | |||||
| required: | |||||
| - name | |||||
| properties: | |||||
| name: | |||||
| type: string | |||||
| workerSpec: | |||||
| type: object | |||||
| required: | |||||
| - scriptDir | |||||
| - scriptBootFile | |||||
| - frameworkType | |||||
| - frameworkVersion | |||||
| properties: | |||||
| scriptDir: | |||||
| type: string | |||||
| scriptBootFile: | |||||
| type: string | |||||
| frameworkType: | |||||
| type: string | |||||
| frameworkVersion: | |||||
| type: string | |||||
| parameters: | |||||
| type: array | |||||
| items: | |||||
| type: object | |||||
| required: | |||||
| - key | |||||
| - value | |||||
| properties: | |||||
| key: | |||||
| type: string | |||||
| value: | |||||
| type: string | |||||
| cloudWorker: | |||||
| type: object | |||||
| required: | |||||
| - name | |||||
| - model | |||||
| - nodeName | |||||
| - workerSpec | |||||
| properties: | |||||
| name: | |||||
| type: string | |||||
| model: | |||||
| type: object | |||||
| required: | |||||
| - name | |||||
| properties: | |||||
| name: | |||||
| type: string | |||||
| nodeName: | |||||
| type: string | |||||
| workerSpec: | |||||
| type: object | |||||
| required: | |||||
| - scriptDir | |||||
| - scriptBootFile | |||||
| - frameworkType | |||||
| - frameworkVersion | |||||
| properties: | |||||
| scriptDir: | |||||
| type: string | |||||
| scriptBootFile: | |||||
| type: string | |||||
| frameworkType: | |||||
| type: string | |||||
| frameworkVersion: | |||||
| type: string | |||||
| parameters: | |||||
| type: array | |||||
| items: | |||||
| type: object | |||||
| required: | |||||
| - key | |||||
| - value | |||||
| properties: | |||||
| key: | |||||
| type: string | |||||
| value: | |||||
| type: string | |||||
| status: | |||||
| type: object | |||||
| properties: | |||||
| conditions: | |||||
| type: array | |||||
| items: | |||||
| type: object | |||||
| properties: | |||||
| type: | |||||
| type: string | |||||
| status: | |||||
| type: string | |||||
| lastHeartbeatTime: | |||||
| type: string | |||||
| format: date-time | |||||
| lastTransitionTime: | |||||
| type: string | |||||
| format: date-time | |||||
| reason: | |||||
| type: string | |||||
| message: | |||||
| type: string | |||||
| startTime: | |||||
| type: string | |||||
| format: date-time | |||||
| active: | |||||
| type: integer | |||||
| failed: | |||||
| type: integer | |||||
| metrics: | |||||
| type: array | |||||
| items: | |||||
| type: object | |||||
| properties: | |||||
| key: | |||||
| type: string | |||||
| value: | |||||
| type: string | |||||
| additionalPrinterColumns: | |||||
| - name: status | |||||
| type: string | |||||
| description: The status of the jointinference service | |||||
| jsonPath: ".status.conditions[-1].type" | |||||
| - name: active | |||||
| type: integer | |||||
| description: The number of active worker | |||||
| jsonPath: ".status.active" | |||||
| - name: failed | |||||
| type: integer | |||||
| description: The number of failed worker | |||||
| jsonPath: ".status.failed" | |||||
| - name: Age | |||||
| type: date | |||||
| jsonPath: .metadata.creationTimestamp | |||||
| ``` | |||||
| ### Joint inference type definition | |||||
| [go source](cloud/pkg/apis/neptune/v1alpha1/jointinferenceservice_types.go) | |||||
| ```go | |||||
| package v1alpha1 | |||||
| import ( | |||||
| v1 "k8s.io/api/core/v1" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| ) | |||||
| // +genclient | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // JointInferenceService describes the data that a jointinferenceservice resource should have | |||||
| type JointInferenceService struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ObjectMeta `json:"metadata"` | |||||
| Spec JointInferenceServiceSpec `json:"spec"` | |||||
| Status JointInferenceServiceStatus `json:"status,omitempty"` | |||||
| } | |||||
| // JointInferenceServiceSpec is a description of a jointinferenceservice | |||||
| type JointInferenceServiceSpec struct { | |||||
| EdgeWorker EdgeWorker `json:"edgeWorker"` | |||||
| CloudWorker CloudWorker `json:"cloudWorker"` | |||||
| } | |||||
| // EdgeWorker describes the data an edge worker should have | |||||
| type EdgeWorker struct { | |||||
| Name string `json:"name"` | |||||
| Model SmallModel `json:"model"` | |||||
| NodeName string `json:"nodeName"` | |||||
| HardExampleAlgorithm HardExampleAlgorithm `json:"hardExampleAlgorithm"` | |||||
| WorkerSpec CommonWorkerSpec `json:"workerSpec"` | |||||
| } | |||||
| // CloudWorker describes the data a cloud worker should have | |||||
| type CloudWorker struct { | |||||
| Name string `json:"name"` | |||||
| Model BigModel `json:"model"` | |||||
| NodeName string `json:"nodeName"` | |||||
| WorkerSpec CommonWorkerSpec `json:"workerSpec"` | |||||
| } | |||||
| type SmallModel struct { | |||||
| Name string `json:"name"` | |||||
| } | |||||
| type BigModel struct { | |||||
| Name string `json:"name"` | |||||
| } | |||||
| type HardExampleAlgorithm struct { | |||||
| Name string `json:"name"` | |||||
| } | |||||
| // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | |||||
| // JointInferenceServiceList is a list of JointInferenceServices. | |||||
| type JointInferenceServiceList struct { | |||||
| metav1.TypeMeta `json:",inline"` | |||||
| metav1.ListMeta `json:"metadata"` | |||||
| Items []JointInferenceService `json:"items"` | |||||
| } | |||||
| // JointInferenceServiceStatus represents the current state of a joint inference service. | |||||
| type JointInferenceServiceStatus struct { | |||||
| // The latest available observations of a joint inference service's current state. | |||||
| // +optional | |||||
| Conditions []JointInferenceServiceCondition `json:"conditions,omitempty"` | |||||
| // Represents time when the service was acknowledged by the service controller. | |||||
| // It is not guaranteed to be set in happens-before order across separate operations. | |||||
| // It is represented in RFC3339 form and is in UTC. | |||||
| // +optional | |||||
| StartTime *metav1.Time `json:"startTime,omitempty"` | |||||
| // The number of actively running workers. | |||||
| // +optional | |||||
| Active int32 `json:"active"` | |||||
| // The number of workers which reached to Failed. | |||||
| // +optional | |||||
| Failed int32 `json:"failed"` | |||||
| // Metrics of the joint inference service. | |||||
| Metrics []Metric `json:"metrics,omitempty"` | |||||
| } | |||||
| type JointInferenceServiceConditionType string | |||||
| // These are valid conditions of a service. | |||||
| const ( | |||||
| // JointInferenceServiceCondPending means the service has been accepted by the system, | |||||
| // but one or more of the workers has not been started. | |||||
| JointInferenceServiceCondPending JointInferenceServiceConditionType = "Pending" | |||||
| // JointInferenceServiceCondFailed means the service has failed its execution. | |||||
| JointInferenceServiceCondFailed JointInferenceServiceConditionType = "Failed" | |||||
| // JointInferenceServiceCondRunning means the service is running. | |||||
| JointInferenceServiceCondRunning JointInferenceServiceConditionType = "Running" | |||||
| ) | |||||
| // JointInferenceServiceCondition describes current state of a service. | |||||
| type JointInferenceServiceCondition struct { | |||||
| // Type of service condition, Pending, Running or Failed. | |||||
| Type JointInferenceServiceConditionType `json:"type"` | |||||
| // Status of the condition, one of True, False, Unknown. | |||||
| Status v1.ConditionStatus `json:"status"` | |||||
| // Last time the condition was checked. | |||||
| // +optional | |||||
| LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"` | |||||
| // Last time the condition transit from one status to another. | |||||
| // +optional | |||||
| LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` | |||||
| // (brief) reason for the condition's last transition. | |||||
| // +optional | |||||
| Reason string `json:"reason,omitempty"` | |||||
| // Human readable message indicating details about last transition. | |||||
| // +optional | |||||
| Message string `json:"message,omitempty"` | |||||
| } | |||||
| ``` | |||||
| #### Validation | |||||
| [Open API v3 Schema based validation](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#validation) can be used to guard against bad requests. | |||||
| Invalid values for fields ( example string value for a boolean field etc) can be validated using this. | |||||
| Here is a list of validations we need to support : | |||||
| 1. The `dataset` specified in the crd should exist in k8s. | |||||
| 1. The `model` specified in the crd should exist in k8s. | |||||
| 1. The edge node name specified in the crd should exist in k8s. | |||||
| ### joint inference sample | |||||
| ```yaml | |||||
| apiVersion: neptune.io/v1alpha1 | |||||
| kind: JointInferenceService | |||||
| metadata: | |||||
| name: helmet-detection-demo | |||||
| namespace: default | |||||
| spec: | |||||
| edgeWorker: | |||||
| name: "edgeworker" | |||||
| model: | |||||
| name: "small-model" | |||||
| nodeName: "edge0" | |||||
| hardExampleAlgorithm: | |||||
| name: "IBT" | |||||
| workerSpec: | |||||
| scriptDir: "/code" | |||||
| scriptBootFile: "edge_inference.py" | |||||
| frameworkType: "tensorflow" | |||||
| frameworkVersion: "1.18" | |||||
| parameters: | |||||
| - key: "nms_threshold" | |||||
| value: "0.6" | |||||
| cloudWorker: | |||||
| name: "work" | |||||
| model: | |||||
| name: "big-model" | |||||
| nodeName: "solar-corona-cloud" | |||||
| workerSpec: | |||||
| scriptDir: "/code" | |||||
| scriptBootFile: "cloud_inference.py" | |||||
| frameworkType: "tensorflow" | |||||
| frameworkVersion: "1.18" | |||||
| parameters: | |||||
| - key: "nms_threshold" | |||||
| value: "0.6" | |||||
| ``` | |||||
| ## Controller Design | |||||
| The joint inference controller starts three separate goroutines called `upstream`, `downstream` and `joint-inference` controller. These are not separate controllers as such but named here for clarity. | |||||
| - joint inference: watch the updates of joint-inference-task crds, and create the workers to complete the task. | |||||
| - downstream: synchronize the joint-inference updates from the cloud to the edge node. | |||||
| - upstream: synchronize the joint-inference updates from the edge to the cloud node. | |||||
| ### Joint Inference Controller | |||||
|  | |||||
| The joint-inference controller watches for the updates of joint-inference tasks and the corresponding pods against the K8S API server. | |||||
| Updates are categorized below along with the possible actions: | |||||
| | Update Type | Action | | |||||
| |-------------------------------|---------------------------------------------- | | |||||
| |New Joint-inference-service Created |Create the cloud/edge worker| | |||||
| |Joint-inference-service Deleted | NA. These workers will be deleted by GM.| | |||||
| |The corresponding pod created/running/completed/failed | Update the status of joint-inference task.| | |||||
| ### Downstream Controller | |||||
|  | |||||
| The downstream controller watches for joint-inference updates against the K8S API server. | |||||
| Updates are categorized below along with the possible actions that the downstream controller can take: | |||||
| | Update Type | Action | | |||||
| |-------------------------------|---------------------------------------------- | | |||||
| |New Joint-inference-service Created |Sends the task information to LCs.| | |||||
| |Joint-inference-service Deleted | The controller sends the delete event to LCs.| | |||||
| ### Upstream Controller | |||||
|  | |||||
| The upstream controller watches for joint-inference-task updates from the edge node and applies these updates against the API server in the cloud. | |||||
| Updates are categorized below along with the possible actions that the upstream controller can take: | |||||
| | Update Type | Action | | |||||
| |------------------------------- |---------------------------------------------- | | |||||
| |Joint-inference-service Reported State Updated | The controller appends the reported status of the Joint-inference-service in the cloud. | | |||||
| ### Details of api between GM(cloud) and LC(edge) | |||||
| 1. GM(downstream controller) syncs the task info to LC: | |||||
| ```go | |||||
| // POST <namespace>/neptune/downstream/jointinferenceservices/<name>/insert | |||||
| // body same to the task crd of k8s api, omitted here. | |||||
| ``` | |||||
| 1. LC uploads the task status reported by the worker to GM(upstream controller): | |||||
| ```go | |||||
| // POST <namespace>/neptune/upstream/jointinferenceservices/<name>/status | |||||
| // JoinInferenceServiceStatus defines the status that is sent to the GlobalManager | |||||
| type JoinInferenceServiceStatus struct { | |||||
| Phase string `json:"phase"` | |||||
| Status string `json:"status"` | |||||
| Output *Output `json:"output"` | |||||
| } | |||||
| // Output defines task output information | |||||
| type Output struct { | |||||
| Models []Model `json:"models"` | |||||
| TaskInfo *TaskInfo `json:"taskInfo"` | |||||
| } | |||||
| // Model defines the model information | |||||
| type Model struct { | |||||
| Format string `json:"format"` | |||||
| URL string `json:"url"` | |||||
| } | |||||
| // TaskInfo defines the task information | |||||
| type TaskInfo struct { | |||||
| InferenceNumber int `json:"inferenceNumber"` | |||||
| HardExampleNumber int `json:"hardExampleNumber"` | |||||
| UploadCloudRatio float64 `json:"uploadCloudRatio"` | |||||
| StartTime string `json:"startTime"` | |||||
| CurrentTime string `json:"currentTime"` | |||||
| } | |||||
| ``` | |||||
| ### Details of api between Worker(edge) and LC(edge) | |||||
| 1. Worker sends inference info to LC on the same edge node: | |||||
| ``` | |||||
| // POST /neptune/workers/<worker-name>/info | |||||
| ``` | |||||
| ```json | |||||
| { | |||||
| "name": "worker-name", | |||||
| "namespace": "default", | |||||
| "ownerName": "jointinferenceservice-name", | |||||
| "ownerKind": "jointinferenceservice", | |||||
| "kind": "inference", | |||||
| "status": "completed/failed/running", | |||||
| "taskInfo": { | |||||
| "inferenceNumber": 1000, | |||||
| "hardExampleNumber": 100, | |||||
| "uploadCloudRatio": 0.1, | |||||
| "startTime": "2020-11-03T08:39:22.517Z", | |||||
| "updateTime": "2020-11-03T08:50:22.517Z" | |||||
| } | |||||
| } | |||||
| ``` | |||||
| ### Flow of Joint Inference | |||||
| - The flow of joint inference service creation: | |||||
|  | |||||
| ## Workers Communication | |||||
|  | |||||
| @@ -0,0 +1,23 @@ | |||||
| ## Getting Started | |||||
| Neptune is an open source framework for edge-cloud collaborative training and inference, so that AI applications running at the edge can benefit from cost reduction, model performance improvement and data privacy protection. | |||||
| ### Get Neptune | |||||
| You can find the latest Neptune release [here](TODO) | |||||
| ### Deploying Neptune | |||||
| Please refer to this [link](setup/install.html). | |||||
| ### Examples | |||||
| Please refer to this [link](TODO) | |||||
| ### Contributing | |||||
| Contributions are very welcome! You can see our [CONTRIBUTING.md](TODO) for more information. | |||||
| ### Community | |||||
| Neptune is an open source project and, in the spirit of openness and freedom, we welcome new contributors to join us. You can get in touch with the community in the following ways: | |||||
| * [Github Issues](TODO) | |||||
| @@ -0,0 +1,21 @@ | |||||
| # Roadmap | |||||
| This document defines a high level roadmap for neptune development. | |||||
| The [milestones defined in GitHub](https://github.com/edgeai-neptune/neptune/milestones) represent the most up-to-date plans. | |||||
| ## 2021 Q1 Roadmap | |||||
| - Support edge model and dataset management. | |||||
| - Support incremental learning, with time trigger, sample size trigger, and precision-based trigger, and integrating hard sample discovering algorithm. | |||||
| - Support collaborative training, integrating some common weight/gradient compression algorithm. | |||||
| ## Future | |||||
| - Integrate some common multi-task migration algorithms to resolve the problem of low precision caused by small size samples. | |||||
| - Integrate KubeFlow and ONNX into Neptune, to enable interoperability of edge models with diverse formats. | |||||
| - Integrate typical AI frameworks into Neptune, including TensorFlow, PyTorch, PaddlePaddle, MindSpore, etc. | |||||
| @@ -0,0 +1,294 @@ | |||||
| * [Prerequisites](#prerequisites) | |||||
| * [Download project source](#download-source) | |||||
| * [Create CRDs](#create-crds) | |||||
| * [Deploy GM](#deploy-gm) | |||||
| * [Prepare GM config](#prepare-gm-config) | |||||
| * [Build worker base images](#build-worker-base-images) | |||||
| * [Run GM as k8s pod(recommended)](#run-gm-as-k8s-podrecommended) | |||||
| * [Run GM as a single process(alternative)](#run-gm-as-a-single-processalternative) | |||||
| * [Run GM as docker container(alternative)](#run-gm-as-docker-containeralternative) | |||||
| * [Deploy LC](#deploy-lc) | |||||
| ## Deploy Neptune | |||||
| ### Prerequisites | |||||
| - [GIT][git_tool] | |||||
| - [GO][go_tool] version v1.15+. | |||||
| - [Kubernetes][kubernetes] 1.16+. | |||||
| - [KubeEdge][kubeedge] version v1.5+. | |||||
| GM will be deployed to a node which has satisfied these requirements: | |||||
| 1. Has a public IP address which the edge can access. | |||||
| 1. Can access the k8s master. | |||||
| You can simply use the node where the `cloudcore` of `kubeedge` is deployed. | |||||
| The shell commands below should be executed on this node in **one terminal session** to keep the shell variables. | |||||
| ### Download source | |||||
| ```shell | |||||
| git clone http://github.com/edgeai-neptune/neptune.git | |||||
| cd neptune | |||||
| git checkout master | |||||
| ``` | |||||
| ### Create CRDs | |||||
| ```shell | |||||
| # create these crds including dataset, model, joint-inference | |||||
| kubectl apply -f build/crds/neptune/ | |||||
| ``` | |||||
| ### Deploy GM | |||||
| #### Prepare GM config | |||||
| Make a copy of `build/gm/gm-config.yaml`: | |||||
| ```yaml | |||||
| kubeConfig: "" | |||||
| master: "" | |||||
| namespace: "" | |||||
| imageHub: | |||||
| "tensorflow:1.15": "docker.io/neptune/tensorflow-base-image-to-filled:1.15" | |||||
| websocket: | |||||
| address: 0.0.0.0 | |||||
| port: 9000 | |||||
| localController: | |||||
| server: http://localhost:9100 | |||||
| ``` | |||||
| 1. `kubeConfig`: config to connect k8s, default `""` | |||||
| 1. `master`: k8s master addr, default `""` | |||||
| 1. `namespace`: the namespace GM watches, `""` means that gm watches all namespaces, default `""`. | |||||
| 1. `imageHub`: the base image mapping for model training/evaluation/inference, whose key is frameworkType/frameworkVersion. | |||||
| 1. `websocket`: since the current limit of kubeedge(1.5), GM needs to build the websocket channel for communicating between GM and LCs. | |||||
| 1. `localController`: | |||||
| - `server`: to be injected into the worker to connect LC. | |||||
| #### Build worker base images | |||||
| Here build worker base image for tensorflow 1.15 for example: | |||||
| ```shell | |||||
| # edit it with the real base repo of your choice. | |||||
| IMG_BASE_ADDR=docker.io/neptune | |||||
| # build tensorflow image | |||||
| WORKER_TF1_IMAGE=$IMG_BASE_ADDR/worker-tensorflow:1.15 | |||||
| docker build -f build/worker/base_images/tensorflow/tensorflow-1.15.Dockerfile -t $WORKER_TF1_IMAGE . | |||||
| # push worker image to registry, login to registry first if needed | |||||
| docker push $WORKER_TF1_IMAGE | |||||
| ``` | |||||
| There are some methods to run gm, you can choose one method below: | |||||
| #### Run GM as k8s pod(**recommended**): | |||||
| We don't need to config the kubeconfig in this method said by [accessing the API from a Pod](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod). | |||||
| 1\. Create the cluster role so that GM can read/write the CRDs: | |||||
| ```shell | |||||
| # create the cluster role | |||||
| kubectl create -f build/gm/rbac/ | |||||
| ``` | |||||
| 2\. Prepare the config: | |||||
| ```shell | |||||
| # edit it with another number if you wish | |||||
| GM_PORT=9000 | |||||
| LC_PORT=9100 | |||||
| # fill the GM_NODE_NAME's ip which edge node can access to. | |||||
| # such as GM_IP=192.168.0.9 | |||||
| GM_IP=<GM_NODE_NAME_IP_ADDRESS> | |||||
| # edit it with the real base repo of your choice. | |||||
| IMG_BASE_ADDR=docker.io/neptune | |||||
| GM_ADDRESS=$GM_IP:$GM_PORT | |||||
| LC_SERVER="http://localhost:$LC_PORT" | |||||
| ``` | |||||
| ```shell | |||||
| # copy and edit CONFIG_FILE. | |||||
| CONFIG_FILE=gm-config.yaml | |||||
| cp build/gm/gm-config.yaml $CONFIG_FILE | |||||
| # prepare the config with empty kubeconfig and empty master url meaning accessing k8s by rest.InClusterConfig(). | |||||
| # here using the sed command; alternatively you can edit the config file manually. | |||||
| sed -i 's@kubeConfig:.*@kubeConfig: ""@' $CONFIG_FILE | |||||
| sed -i 's@master:.*@master: ""@' $CONFIG_FILE | |||||
| sed -i "s@port:.*@port: $GM_PORT@" $CONFIG_FILE | |||||
| # setting tensorflow1.15 base image | |||||
| sed -i 's@\("tensorflow:1.15":\).*@\1 '"$WORKER_TF1_IMAGE@" $CONFIG_FILE | |||||
| # setting lc server | |||||
| sed -i "s@http://localhost:9100@$LC_SERVER@" $CONFIG_FILE | |||||
| ``` | |||||
| 3\. Build the GM image: | |||||
| ```shell | |||||
| # build image from source OR use the gm image previous built. | |||||
| # edit it with the real base repo of your choice. | |||||
| GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1 | |||||
| # build docker image | |||||
| docker build -f build/gm/Dockerfile --tag $GM_IMAGE . | |||||
| # push image to registry, login to registry first if needed | |||||
| docker push $GM_IMAGE | |||||
| ``` | |||||
| 4\. Create gm configmap: | |||||
| ```shell | |||||
| # create configmap from $CONFIG_FILE | |||||
| CONFIG_NAME=neptune-gm-config # customize this configmap name | |||||
| kubectl create configmap $CONFIG_NAME --from-file=$CONFIG_FILE | |||||
| ``` | |||||
| 5\. Deploy GM as pod: | |||||
| ```shell | |||||
| # we assign gm to a node which the edge node can access. | |||||
| # here current terminal node name, i.e. the k8s master node. | |||||
| # remember the GM_IP | |||||
| GM_NODE_NAME=$(hostname) | |||||
| GM_POD_NAME=gm-from-$CONFIG_NAME | |||||
| kubectl apply -f - <<EOF | |||||
| apiVersion: v1 | |||||
| kind: Pod | |||||
| metadata: | |||||
| name: $GM_POD_NAME | |||||
| spec: | |||||
| restartPolicy: OnFailure | |||||
| hostNetwork: true | |||||
| nodeName: $GM_NODE_NAME | |||||
| containers: | |||||
| - name: gm | |||||
| image: $GM_IMAGE | |||||
| command: ["neptune-gm", "--config", "/config/$CONFIG_FILE", "-v2"] | |||||
| volumeMounts: | |||||
| - name: gm-config | |||||
| mountPath: /config | |||||
| volumes: | |||||
| - name: gm-config | |||||
| configMap: | |||||
| name: $CONFIG_NAME | |||||
| EOF | |||||
| ``` | |||||
| 6\. Check the GM status: | |||||
| ```shell | |||||
| kubectl get pod $GM_POD_NAME | |||||
| ``` | |||||
| #### Run GM as a single process(alternative) | |||||
| 1\. config GM: | |||||
| ```shell | |||||
| cp build/gm/neptune-gm.yaml gm.yaml | |||||
| # make sure /root/.kube/config exists | |||||
| sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' gm.yaml | |||||
| ``` | |||||
| 2\. compile and run GM direct: | |||||
| ```shell | |||||
| go build cmd/neptune-gm/neptune-gm.go | |||||
| ./neptune-gm --config gm.yaml -v2 | |||||
| ``` | |||||
| #### Run GM as docker container(alternative) | |||||
| 1\. build GM image: | |||||
| ```shell | |||||
| GM_IMAGE=$IMG_BASE_ADDR/neptune-gm:v1alpha1 | |||||
| sed -i 's@kubeConfig.*@kubeConfig: /root/.kube/config@' build/gm/neptune-gm.yaml | |||||
| docker build -f build/gm/Dockerfile --tag $GM_IMAGE . | |||||
| ``` | |||||
| 2\. run GM as container: | |||||
| ```shell | |||||
| docker run --net host -v /root/.kube:/root/.kube $GM_IMAGE | |||||
| ``` | |||||
| ### Deploy LC | |||||
| Prerequisites: | |||||
| 1. Run GM successfully. | |||||
| 2. Get the bind address/port of GM. | |||||
| Steps: | |||||
| 1\. Build LC image: | |||||
| ```shell | |||||
| LC_IMAGE=$IMG_BASE_ADDR/neptune-lc:v1alpha1 | |||||
| docker build -f build/lc/Dockerfile --tag $LC_IMAGE . | |||||
| # push image to registry, login to registry first if needed | |||||
| docker push $LC_IMAGE | |||||
| ``` | |||||
| 2\. Deploy LC as k8s daemonset: | |||||
| ```shell | |||||
| LC_DS_NAME=edge-lc | |||||
| kubectl create -f- <<EOF | |||||
| apiVersion: apps/v1 | |||||
| kind: DaemonSet | |||||
| metadata: | |||||
| labels: | |||||
| k8s-app: neptune-lc | |||||
| name: $LC_DS_NAME | |||||
| namespace: default | |||||
| spec: | |||||
| selector: | |||||
| matchLabels: | |||||
| k8s-app: $LC_DS_NAME | |||||
| template: | |||||
| metadata: | |||||
| labels: | |||||
| k8s-app: $LC_DS_NAME | |||||
| spec: | |||||
| containers: | |||||
| - name: $LC_DS_NAME | |||||
| image: $LC_IMAGE | |||||
| imagePullPolicy: Always | |||||
| env: | |||||
| - name: GM_ADDRESS | |||||
| value: $GM_ADDRESS | |||||
| - name: BIND_PORT | |||||
| value: "$LC_PORT" | |||||
| - name: NODENAME | |||||
| valueFrom: | |||||
| fieldRef: | |||||
| fieldPath: spec.nodeName | |||||
| - name: ROOTFS_MOUNT_DIR | |||||
| # the value of ROOTFS_MOUNT_DIR is same with the mount path of volume | |||||
| value: /rootfs | |||||
| volumeMounts: | |||||
| - name: localcontroller | |||||
| mountPath: /rootfs | |||||
| volumes: | |||||
| - name: localcontroller | |||||
| hostPath: | |||||
| path: / | |||||
| restartPolicy: Always | |||||
| hostNetwork: true | |||||
| EOF | |||||
| ``` | |||||
| 3\. Check the LC status: | |||||
| ```shell | |||||
| kubectl get ds $LC_DS_NAME | |||||
| kubectl get pod |grep $LC_DS_NAME | |||||
| ``` | |||||
| [git_tool]:https://git-scm.com/downloads | |||||
| [go_tool]:https://golang.org/dl/ | |||||
| [kubeedge]:https://github.com/kubeedge/kubeedge | |||||
| [kubernetes]:https://kubernetes.io/ | |||||