Browse Source

Add incremental-learning upstream/downstream

Signed-off-by: llhuii <liulinghui@huawei.com>
tags/v0.1.0
llhuii 4 years ago
parent
commit
58075efbce
2 changed files with 155 additions and 10 deletions
  1. +68
    -3
      pkg/globalmanager/downstream.go
  2. +87
    -7
      pkg/globalmanager/upstream.go

+ 68
- 3
pkg/globalmanager/downstream.go View File

@@ -1,9 +1,11 @@
package globalmanager package globalmanager


import ( import (
"context"
"fmt" "fmt"
"time" "time"


metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch" "k8s.io/apimachinery/pkg/watch"
@@ -67,6 +69,58 @@ func (dc *DownstreamController) syncFederatedLearningJob(eventType watch.EventTy
return nil return nil
} }


// syncModelWithName will sync the model to the specified node.
// Now called when creating the incrementaljob.
func (dc *DownstreamController) syncModelWithName(nodeName, modelName, namespace string) error {
model, err := dc.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{})
if err != nil {
// TODO: maybe use err.ErrStatus.Code == 404
return fmt.Errorf("model(%s/%s) not found", namespace, modelName)
}

// Since model.Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
if len(model.Kind) == 0 {
model.Kind = "Model"
}

dc.messageLayer.SendResourceObject(nodeName, watch.Added, model)
return nil
}

// syncIncrementalJob syncs the incremental learning jobs
func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, job *neptunev1.IncrementalLearningJob) error {
// Here only propagate to the nodes with non empty name
nodeName := job.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

// Sync the model info to edgenode when the job is created
if eventType == watch.Added {
models := make(map[string]bool)
for _, modelName := range []string{
job.Spec.InitialModel.Name,
job.Spec.DeploySpec.Model.Name,
} {
models[modelName] = true
}

for modelName := range models {
err := dc.syncModelWithName(nodeName, modelName, job.Namespace)
if err != nil {
klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", modelName, job.Name, nodeName, err)
}
}
} else if eventType == watch.Deleted {
// noop
}

dc.messageLayer.SendResourceObject(nodeName, eventType, job)
return nil
}

// sync defines the entrypoint of syncing all resources // sync defines the entrypoint of syncing all resources
func (dc *DownstreamController) sync(stopCh <-chan struct{}) { func (dc *DownstreamController) sync(stopCh <-chan struct{}) {
for { for {
@@ -111,6 +165,15 @@ func (dc *DownstreamController) sync(stopCh <-chan struct{}) {
name = t.Name name = t.Name
err = dc.syncFederatedLearningJob(e.Type, t) err = dc.syncFederatedLearningJob(e.Type, t)


case (*neptunev1.IncrementalLearningJob):
if len(t.Kind) == 0 {
t.Kind = "IncrementalLearningJob"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncIncrementalJob(e.Type, t)

default: default:
klog.Warningf("object type: %T unsupported", e) klog.Warningf("object type: %T unsupported", e)
continue continue
@@ -153,10 +216,12 @@ func (dc *DownstreamController) watch(stopCh <-chan struct{}) {
resyncPeriod := time.Second * 60 resyncPeriod := time.Second * 60
namespace := dc.cfg.Namespace namespace := dc.cfg.Namespace


// TODO: use the informer
for resourceName, object := range map[string]runtime.Object{ for resourceName, object := range map[string]runtime.Object{
"datasets": &neptunev1.Dataset{},
"jointinferenceservices": &neptunev1.JointInferenceService{},
"federatedlearningjobs": &neptunev1.FederatedLearningJob{},
"datasets": &neptunev1.Dataset{},
"jointinferenceservices": &neptunev1.JointInferenceService{},
"federatedlearningjobs": &neptunev1.FederatedLearningJob{},
"incrementallearningjobs": &neptunev1.IncrementalLearningJob{},
} { } {
lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything()) lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything())
si := cache.NewSharedInformer(lw, object, resyncPeriod) si := cache.NewSharedInformer(lw, object, resyncPeriod)


+ 87
- 7
pkg/globalmanager/upstream.go View File

@@ -4,7 +4,9 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"strings"


v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2" "k8s.io/klog/v2"


@@ -45,7 +47,7 @@ func newUnmarshalError(namespace, name, operation string, content []byte) error
return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content)) return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content))
} }


func checkUpstreamOpeation(operation string) error {
func checkUpstreamOperation(operation string) error {
// current only support the 'status' operation // current only support the 'status' operation
if operation != "status" { if operation != "status" {
return fmt.Errorf("unknown operation %s", operation) return fmt.Errorf("unknown operation %s", operation)
@@ -75,7 +77,7 @@ func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status


// updateDatasetFromEdge syncs update from edge // updateDatasetFromEdge syncs update from edge
func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOpeation(operation)
err := checkUpstreamOperation(operation)
if err != nil { if err != nil {
return err return err
} }
@@ -124,7 +126,7 @@ func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string


// updateJointInferenceFromEdge syncs the edge updates to k8s // updateJointInferenceFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error { func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOpeation(operation)
err := checkUpstreamOperation(operation)
if err != nil { if err != nil {
return err return err
} }
@@ -218,7 +220,7 @@ func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, na


// updateFederatedLearningJobFromEdge updates the federated job's status // updateFederatedLearningJobFromEdge updates the federated job's status
func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) { func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) {
err = checkUpstreamOpeation(operation)
err = checkUpstreamOperation(operation)
if err != nil { if err != nil {
return err return err
} }
@@ -276,6 +278,83 @@ func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace
return nil return nil
} }


func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond neptunev1.ILJobCondition) error {
client := uc.client.IncrementalLearningJobs(namespace)
return retryUpdateStatus(name, namespace, (func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
}))
}

// updateIncrementalLearningFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOperation(operation)
if err != nil {
return err
}
var jobStatus struct {
Phase string `json:"phase"`
Status string `json:"status"`
}

err = json.Unmarshal(content, &jobStatus)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}

// Get the condition data.
// Here unmarshal and marshal immediately to skip the unnecessary fields
var condData IncrementalCondData
err = json.Unmarshal(content, &condData)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}
condDataBytes, _ := json.Marshal(&condData)

cond := neptunev1.ILJobCondition{
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Data: string(condDataBytes),
Message: "reported by lc",
}

switch strings.ToLower(jobStatus.Phase) {
case "train":
cond.Stage = neptunev1.ILJobTrain
case "eval":
cond.Stage = neptunev1.ILJobEval
case "deploy":
cond.Stage = neptunev1.ILJobDeploy
default:
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
}

switch strings.ToLower(jobStatus.Status) {
case "ready":
cond.Type = neptunev1.ILJobStageCondReady
case "completed":
cond.Type = neptunev1.ILJobStageCondCompleted
case "failed":
cond.Type = neptunev1.ILJobStageCondFailed
case "waiting":
cond.Type = neptunev1.ILJobStageCondWaiting
default:
return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
}

err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond)
if err != nil {
return fmt.Errorf("failed to append condition, err:%+w", err)
}
return nil
}

// syncEdgeUpdate receives the updates from edge and syncs these to k8s. // syncEdgeUpdate receives the updates from edge and syncs these to k8s.
func (uc *UpstreamController) syncEdgeUpdate() { func (uc *UpstreamController) syncEdgeUpdate() {
for { for {
@@ -336,9 +415,10 @@ func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, er
// NOTE: current no direct model update from edge, // NOTE: current no direct model update from edge,
// model update will be triggered by the corresponding training feature // model update will be triggered by the corresponding training feature
uc.updateHandlers = map[string]updateHandler{ uc.updateHandlers = map[string]updateHandler{
"dataset": uc.updateDatasetFromEdge,
"jointinferenceservice": uc.updateJointInferenceFromEdge,
"federatedlearningjob": uc.updateFederatedLearningJobFromEdge,
"dataset": uc.updateDatasetFromEdge,
"jointinferenceservice": uc.updateJointInferenceFromEdge,
"federatedlearningjob": uc.updateFederatedLearningJobFromEdge,
"incrementallearningjob": uc.updateIncrementalLearningFromEdge,
} }


return uc, nil return uc, nil


Loading…
Cancel
Save