From 48135b2393b3f277c6dd0da343a3acb4245d874c Mon Sep 17 00:00:00 2001
From: llhuii
Date: Tue, 20 Jul 2021 16:49:06 +0800
Subject: [PATCH 1/7] gm: refactor all features into independent dir

All controllers are placed into globalmanager/controllers:
1. each feature has an independent subdirectory
2. upstream/downstream are kept at the top level.

Common types/utils/worker.go are placed into globalmanager/runtime.

Signed-off-by: llhuii
---
 cmd/sedna-gm/app/controller.go                |  11 +-
 .../{ => controllers}/downstream.go           |  23 +-
 .../federatedlearningjob.go                   | 163 ++++++------
 .../incrementallearningjob.go                 | 242 +++++++++---------
 .../jointinference}/jointinferenceservice.go  | 157 ++++++------
 .../lifelonglearning}/lifelonglearningjob.go  | 222 ++++++++--------
 .../{controller.go => controllers/manager.go} |  49 ++--
 pkg/globalmanager/controllers/registry.go     |  33 +++
 .../{ => controllers}/upstream.go             |  16 +-
 pkg/globalmanager/{ => runtime}/common.go     |  11 +-
 .../{ => runtime}/secret_injector.go          |   2 +-
 .../storage_initializer_injector.go           |  14 +-
 pkg/globalmanager/{ => runtime}/types.go      |   3 +-
 pkg/globalmanager/{ => runtime}/worker.go     |  30 +--
 .../manager/incrementallearningjob.go         |   8 +-
 15 files changed, 511 insertions(+), 473 deletions(-)
 rename pkg/globalmanager/{ => controllers}/downstream.go (94%)
 rename pkg/globalmanager/{ => controllers/federatedlearning}/federatedlearningjob.go (76%)
 rename pkg/globalmanager/{ => controllers/incrementallearning}/incrementallearningjob.go (77%)
 rename pkg/globalmanager/{ => controllers/jointinference}/jointinferenceservice.go (76%)
 rename pkg/globalmanager/{ => controllers/lifelonglearning}/lifelonglearningjob.go (76%)
 rename pkg/globalmanager/{controller.go => controllers/manager.go} (50%)
 create mode 100644 pkg/globalmanager/controllers/registry.go
 rename pkg/globalmanager/{ => controllers}/upstream.go (96%)
 rename pkg/globalmanager/{ => runtime}/common.go (92%)
 rename pkg/globalmanager/{ => runtime}/secret_injector.go (99%)
 rename pkg/globalmanager/{ => runtime}/storage_initializer_injector.go (97%)
 rename pkg/globalmanager/{ => runtime}/types.go (99%)
 rename pkg/globalmanager/{ => runtime}/worker.go (88%)

diff --git a/cmd/sedna-gm/app/controller.go b/cmd/sedna-gm/app/controller.go
index 07ec6287..52174e50 100644
--- a/cmd/sedna-gm/app/controller.go
+++ b/cmd/sedna-gm/app/controller.go
@@ -18,6 +18,7 @@ package app
 
 import (
 	"fmt"
+	"os"
 
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
@@ -27,7 +28,7 @@ import (
 	"k8s.io/klog/v2"
 
 	"github.com/kubeedge/sedna/cmd/sedna-gm/app/options"
-	controller "github.com/kubeedge/sedna/pkg/globalmanager"
+	controller "github.com/kubeedge/sedna/pkg/globalmanager/controllers"
 	"github.com/kubeedge/sedna/pkg/util"
 	"github.com/kubeedge/sedna/pkg/version/verflag"
 )
@@ -61,8 +62,12 @@ func NewControllerCommand() *cobra.Command {
 			if errs := config.Validate(); len(errs) > 0 {
 				klog.Fatal(util.SpliceErrors(errs.ToAggregate().Errors()))
 			}
-			c := controller.NewController(config)
-			c.Start()
+			c := controller.New(config)
+			err = c.Start()
+			if err != nil {
+				klog.Errorf("failed to start controller: %v", err)
+				os.Exit(1)
+			}
 		},
 	}
 	fs := cmd.Flags()
diff --git a/pkg/globalmanager/downstream.go b/pkg/globalmanager/controllers/downstream.go
similarity index 94%
rename from pkg/globalmanager/downstream.go
rename to pkg/globalmanager/controllers/downstream.go
index 5de2f831..a6134454 100644
--- a/pkg/globalmanager/downstream.go
+++ b/pkg/globalmanager/controllers/downstream.go
@@ -14,7 +14,7 @@
 See the License for the specific language governing
permissions and limitations under the License. */ -package globalmanager +package controllers import ( "context" @@ -23,7 +23,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/runtime" + k8sruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" @@ -33,6 +33,7 @@ import ( clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) @@ -49,7 +50,7 @@ type DownstreamController struct { messageLayer messagelayer.MessageLayer } -func (dc *DownstreamController) injectSecret(obj CommonInterface, secretName string) error { +func (dc *DownstreamController) injectSecret(obj runtime.CommonInterface, secretName string) error { if secretName == "" { return nil } @@ -61,7 +62,7 @@ func (dc *DownstreamController) injectSecret(obj CommonInterface, secretName str return err } - InjectSecretObj(obj, secret) + runtime.InjectSecretObj(obj, secret) return err } @@ -148,8 +149,8 @@ func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, jo ann := job.GetAnnotations() if ann != nil { - trainNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] - evalNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobEval)] + trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] + evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)] } if eventType == watch.Deleted { @@ -303,7 +304,7 @@ func (dc *DownstreamController) sync(stopCh <-chan struct{}) { func (dc *DownstreamController) watch(stopCh <-chan struct{}) { rh := cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { - eventObj := obj.(runtime.Object) + eventObj := obj.(k8sruntime.Object) dc.events <- watch.Event{Type: watch.Added, Object: eventObj} }, UpdateFunc: func(old, cur interface{}) { @@ -313,10 +314,10 @@ func (dc *DownstreamController) watch(stopCh <-chan struct{}) { // Update: // We sync it to edge when using self-built websocket, and // this sync isn't needed when we switch out self-built websocket. 
- dc.events <- watch.Event{Type: watch.Added, Object: cur.(runtime.Object)} + dc.events <- watch.Event{Type: watch.Added, Object: cur.(k8sruntime.Object)} }, DeleteFunc: func(obj interface{}) { - eventObj := obj.(runtime.Object) + eventObj := obj.(k8sruntime.Object) dc.events <- watch.Event{Type: watch.Deleted, Object: eventObj} }, } @@ -328,7 +329,7 @@ func (dc *DownstreamController) watch(stopCh <-chan struct{}) { namespace := dc.cfg.Namespace // TODO: use the informer - for resourceName, object := range map[string]runtime.Object{ + for resourceName, object := range map[string]k8sruntime.Object{ "datasets": &sednav1.Dataset{}, "jointinferenceservices": &sednav1.JointInferenceService{}, "federatedlearningjobs": &sednav1.FederatedLearningJob{}, @@ -361,7 +362,7 @@ func (dc *DownstreamController) GetName() string { } // NewDownstreamController creates a controller DownstreamController from config -func NewDownstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +func NewDownstreamController(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { // TODO: make bufferSize configurable bufferSize := 10 events := make(chan watch.Event, bufferSize) diff --git a/pkg/globalmanager/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go similarity index 76% rename from pkg/globalmanager/federatedlearningjob.go rename to pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index a8ab0c48..103cfcb5 100644 --- a/pkg/globalmanager/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package federatedlearning import ( "context" @@ -46,20 +46,25 @@ import ( sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) +const ( + Name = "FederatedLearning" +) + const ( FLJobStageAgg = "Aggregation" FLJobStageTrain = "Training" ) -// flJobControllerKind contains the schema.GroupVersionKind for this controller type. -var flJobControllerKind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob") +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob") -// FederatedController ensures that all FLJob objects have corresponding pods to +// Controller ensures that all FLJob objects have corresponding pods to // run their configured workload. -type FederatedController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -85,17 +90,17 @@ type FederatedController struct { } // Run the main goroutine responsible for watching and syncing jobs. 
-func (fc *FederatedController) Start() error { +func (c *Controller) Start() error { workers := 1 stopCh := messageContext.Done() go func() { defer utilruntime.HandleCrash() - defer fc.queue.ShutDown() + defer c.queue.ShutDown() klog.Infof("Starting federatedlearning job controller") defer klog.Infof("Shutting down federatedlearning job controller") - if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, fc.podStoreSynced, fc.jobStoreSynced) { + if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, c.podStoreSynced, c.jobStoreSynced) { klog.Errorf("failed to wait for caches to sync") return @@ -103,7 +108,7 @@ func (fc *FederatedController) Start() error { klog.Infof("Starting federatedlearning job workers") for i := 0; i < workers; i++ { - go wait.Until(fc.worker, time.Second, stopCh) + go wait.Until(c.worker, time.Second, stopCh) } <-stopCh @@ -112,18 +117,18 @@ func (fc *FederatedController) Start() error { } // enqueueByPod enqueues the FederatedLearningJob object of the specified pod. -func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != flJobControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - job, err := fc.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name) + job, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -132,27 +137,27 @@ func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) { return } - fc.enqueueController(job, immediate) + c.enqueueController(job, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (fc *FederatedController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - fc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - fc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what federatedlearning job manage it and wake them up. -func (fc *FederatedController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -161,11 +166,11 @@ func (fc *FederatedController) updatePod(old, cur interface{}) { return } - fc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the FederatedLearningJob obj When a pod is deleted -func (fc *FederatedController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -186,13 +191,13 @@ func (fc *FederatedController) deletePod(obj interface{}) { return } } - fc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sednav1.FederatedLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. 
-func (fc *FederatedController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { klog.Warningf("Couldn't get key for object %+v: %v", obj, err) @@ -201,43 +206,43 @@ func (fc *FederatedController) enqueueController(obj interface{}, immediate bool backoff := time.Duration(0) if !immediate { - backoff = getBackoff(fc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - fc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. -func (fc *FederatedController) worker() { - for fc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (fc *FederatedController) processNextWorkItem() bool { - key, quit := fc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer fc.queue.Done(key) + defer c.queue.Done(key) - forget, err := fc.syncFLJob(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - fc.queue.Forget(key) + c.queue.Forget(key) } return true } klog.Warningf("Error syncing federatedlearning job: %v", err) - fc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } -// syncFLJob will sync the flJob with the given key if it has had its expectations fulfilled, meaning +// sync will sync the FederatedLearningJob with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
-func (fc *FederatedController) syncFLJob(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing federatedlearning job %q (%v)", key, time.Since(startTime)) @@ -250,7 +255,7 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid federatedlearning job key %q: either namespace or name is missing", key) } - sharedFLJob, err := fc.jobLister.FederatedLearningJobs(ns).Get(name) + sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("FLJob has been deleted: %v", key) @@ -258,15 +263,15 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { } return false, err } - flJob := *sharedFLJob + flJob := *sharedJob // set kind for flJob in case that the kind is None flJob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob")) // if flJob was finished previously, we don't want to redo the termination if IsFLJobFinished(&flJob) { return true, nil } - selector, _ := GenerateSelector(&flJob) - pods, err := fc.podStore.Pods(flJob.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(&flJob) + pods, err := c.podStore.Pods(flJob.Namespace).List(selector) if err != nil { return false, err } @@ -296,11 +301,11 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { if jobFailed { flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage)) flJob.Status.Phase = sednav1.FLJobFailed - fc.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage) + c.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage) } else { // in the First time, we create the pods if len(pods) == 0 { - active, manageJobErr = fc.createPod(&flJob) + active, manageJobErr = c.createPod(&flJob) } complete := false if succeeded > 0 && active == 0 { @@ -310,7 +315,7 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondComplete, "", "")) now := metav1.Now() flJob.Status.CompletionTime = &now - fc.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed") + c.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed") flJob.Status.Phase = sednav1.FLJobSucceeded } else { flJob.Status.Phase = sednav1.FLJobRunning @@ -361,10 +366,10 @@ func getStatus(pods []*v1.Pod) (succeeded, failed int32) { return } -func (fc *FederatedController) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error { - jobClient := fc.client.FederatedLearningJobs(flJob.Namespace) +func (c *Controller) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error { + jobClient := c.client.FederatedLearningJobs(flJob.Namespace) var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { + for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 { var newFLJob *sednav1.FederatedLearningJob newFLJob, err = jobClient.Get(context.TODO(), flJob.Name, metav1.GetOptions{}) if err != nil { @@ -398,12 +403,12 @@ func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool { return false } -func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) { +func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) { active = 0 ctx := 
context.Background() modelName := job.Spec.AggregationWorker.Model.Name - model, err := fc.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{}) + model, err := c.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{}) if err != nil { return active, fmt.Errorf("failed to get model %s: %w", modelName, err) @@ -412,7 +417,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act secretName := model.Spec.CredentialName var modelSecret *v1.Secret if secretName != "" { - modelSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + modelSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers)) @@ -420,10 +425,10 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act // deliver pod for aggregation worker aggWorker := job.Spec.AggregationWorker - // Configure container mounting and Env information by initial WorkerParam + // Configure container mounting and Env information by initial runtime.WorkerParam var aggPort int32 = 7363 - var aggWorkerParam *WorkerParam = new(WorkerParam) - aggWorkerParam.env = map[string]string{ + aggWorkerParam := new(runtime.WorkerParam) + aggWorkerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "WORKER_NAME": "aggworker-" + utilrand.String(5), "JOB_NAME": job.Name, @@ -432,12 +437,12 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act "PARTICIPANTS_COUNT": participantsCount, } - aggWorkerParam.workerType = FLJobStageAgg - aggWorkerParam.restartPolicy = v1.RestartPolicyOnFailure + aggWorkerParam.WorkerType = FLJobStageAgg + aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure - aggWorkerParam.mounts = append(aggWorkerParam.mounts, - WorkerMount{ - URL: &MountURL{ + aggWorkerParam.Mounts = append(aggWorkerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: model.Spec.URL, Secret: modelSecret, DownloadByInitializer: false, @@ -447,7 +452,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act ) // create aggpod based on configured parameters - _, err = createPodWithTemplate(fc.kubeClient, job, &aggWorker.Template, aggWorkerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, aggWorkerParam) if err != nil { return active, err } @@ -458,9 +463,9 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act // FIXME(llhuii): only the case that Spec.NodeName specified is support, // will support Spec.NodeSelector. 
- appIP, err = GetNodeIPByName(fc.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName) + appIP, err = runtime.GetNodeIPByName(c.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName) - aggServicePort, err = CreateKubernetesService(fc.kubeClient, job, FLJobStageAgg, aggPort, appIP) + aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, FLJobStageAgg, aggPort, appIP) if err != nil { return active, err } @@ -468,7 +473,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act for _, trainingWorker := range job.Spec.TrainingWorkers { // get dataseturl through parsing crd of dataset datasetName := trainingWorker.Dataset.Name - dataset, err := fc.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{}) if err != nil { return active, fmt.Errorf("failed to get dataset %s: %w", datasetName, err) @@ -477,23 +482,23 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act secretName := dataset.Spec.CredentialName var datasetSecret *v1.Secret if secretName != "" { - datasetSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + datasetSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } - // Configure container mounting and Env information by initial WorkerParam - var workerParam *WorkerParam = new(WorkerParam) + // Configure container mounting and env information + workerParam := new(runtime.WorkerParam) - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: model.Spec.URL, Secret: modelSecret, }, EnvName: "MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataset.Spec.URL, Secret: datasetSecret, }, @@ -501,7 +506,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "AGG_PORT": strconv.Itoa(int(aggServicePort)), "AGG_IP": appIP, @@ -511,13 +516,13 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act "NAMESPACE": job.Namespace, "MODEL_NAME": modelName, "DATASET_NAME": datasetName, - "LC_SERVER": fc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = TrainPodType - workerParam.hostNetwork = true - workerParam.restartPolicy = v1.RestartPolicyOnFailure + workerParam.WorkerType = runtime.TrainPodType + workerParam.HostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure // create train pod based on configured parameters - _, err = createPodWithTemplate(fc.kubeClient, job, &trainingWorker.Template, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, workerParam) if err != nil { return active, err } @@ -526,13 +531,9 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act return } -func (fc *FederatedController) GetName() string { - return "FederatedLearningJobController" -} - -// NewFederatedController creates a new FederatedLearningJob controller that keeps the relevant pods -// in sync with their corresponding FFederatedLearningJob objects. 
-func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +// New creates a new federated learning job controller that keeps the relevant pods +// in sync with their corresponding FederatedLearningJob objects. +func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { namespace := cfg.Namespace if namespace == "" { namespace = metav1.NamespaceAll @@ -550,11 +551,11 @@ func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, e eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - fc := &FederatedController{ + fc := &Controller{ kubeClient: kubeClient, client: crdclient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "flJob"), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "flJob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}), cfg: cfg, } diff --git a/pkg/globalmanager/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go similarity index 77% rename from pkg/globalmanager/incrementallearningjob.go rename to pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index 2df9d9f6..b422a875 100644 --- a/pkg/globalmanager/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package incrementallearning import ( "context" @@ -48,15 +48,20 @@ import ( sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) -// ijControllerKind contains the schema.GroupVersionKind for this controller type. -var ijControllerKind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob") +const ( + Name = "IncrementalLearningJob" +) + +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob") -// IncrementalJobController ensures that all IncrementalLearningJob objects have corresponding pods to +// Controller ensures that all IncrementalLearningJob objects have corresponding pods to // run their configured workload. -type IncrementalJobController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -82,24 +87,24 @@ type IncrementalJobController struct { } // Run the main goroutine responsible for watching and syncing jobs. 
-func (jc *IncrementalJobController) Start() error { +func (c *Controller) Start() error { workers := 1 stopCh := messageContext.Done() go func() { defer utilruntime.HandleCrash() - defer jc.queue.ShutDown() + defer c.queue.ShutDown() klog.Infof("Starting incrementallearning job controller") defer klog.Infof("Shutting down incrementallearning job controller") - if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) { + if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, c.podStoreSynced, c.jobStoreSynced) { klog.Errorf("failed to wait for caches to sync") return } klog.Infof("Starting incrementallearning job workers") for i := 0; i < workers; i++ { - go wait.Until(jc.worker, time.Second, stopCh) + go wait.Until(c.worker, time.Second, stopCh) } <-stopCh @@ -108,18 +113,18 @@ func (jc *IncrementalJobController) Start() error { } // enqueueByPod enqueues the jointInferenceService object of the specified pod. -func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != ijControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - service, err := jc.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) + service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -128,27 +133,27 @@ func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { return } - jc.enqueueController(service, immediate) + c.enqueueController(service, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (jc *IncrementalJobController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - jc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - jc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what joint inference service manage it and wake them up. -func (jc *IncrementalJobController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -157,11 +162,11 @@ func (jc *IncrementalJobController) updatePod(old, cur interface{}) { return } - jc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the jointinferenceservice obj When a pod is deleted -func (jc *IncrementalJobController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -182,13 +187,13 @@ func (jc *IncrementalJobController) deletePod(obj interface{}) { return } } - jc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. 
-func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) @@ -197,36 +202,36 @@ func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. -func (jc *IncrementalJobController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *IncrementalJobController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err)) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } @@ -234,7 +239,7 @@ func (jc *IncrementalJobController) processNextWorkItem() bool { // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
-func (jc *IncrementalJobController) sync(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime)) @@ -247,7 +252,7 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key) } - sharedIncrementalJob, err := jc.jobLister.IncrementalLearningJobs(ns).Get(name) + sharedIncrementalJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("incrementallearning job has been deleted: %v", key) @@ -262,12 +267,12 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { if incrementaljob.Status.StartTime == nil { now := metav1.Now() incrementaljob.Status.StartTime = &now - pod := jc.getSpecifiedPods(&incrementaljob, InferencePodType) + pod := c.getSpecifiedPods(&incrementaljob, runtime.InferencePodType) if pod == nil { - err = jc.createInferPod(&incrementaljob) + err = c.createInferPod(&incrementaljob) } else { if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending { - err = jc.createInferPod(&incrementaljob) + err = c.createInferPod(&incrementaljob) } } if err != nil { @@ -285,13 +290,13 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { needUpdated := false // update conditions of incremental job - needUpdated, err = jc.updateIncrementalJobConditions(&incrementaljob) + needUpdated, err = c.updateIncrementalJobConditions(&incrementaljob) if err != nil { klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) } if needUpdated { - if err := jc.updateIncrementalJobStatus(&incrementaljob); err != nil { + if err := c.updateIncrementalJobStatus(&incrementaljob); err != nil { return forget, err } @@ -308,8 +313,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { // setWorkerNodeNameOfJob sets the worker nodeName of the specified job // which is used for downstream to sync job info to the specified LC located in nodeName. 
-func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { - key := AnnotationsKeyPrefix + jobStage +func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { + key := runtime.AnnotationsKeyPrefix + jobStage ann := job.GetAnnotations() if ann != nil { @@ -319,9 +324,9 @@ func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.Incremen } } - jobClient := jc.client.IncrementalLearningJobs(job.Namespace) + jobClient := c.client.IncrementalLearningJobs(job.Namespace) var err error - for i := 0; i <= ResourceUpdateRetries; i++ { + for i := 0; i <= runtime.ResourceUpdateRetries; i++ { var newJob *sednav1.IncrementalLearningJob newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { @@ -345,7 +350,7 @@ func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.Incremen } // updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus -func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) { +func (c *Controller) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) { var initialType sednav1.ILJobStageConditionType var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{ Stage: sednav1.ILJobTrain, @@ -361,7 +366,7 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo latestCondition = (jobConditions)[len(jobConditions)-1] klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name, latestCondition.Stage) - pod = jc.getSpecifiedPods(incrementaljob, string(latestCondition.Stage)) + pod = c.getSpecifiedPods(incrementaljob, string(latestCondition.Stage)) if pod != nil { podStatus = pod.Status.Phase @@ -383,14 +388,14 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo // include train, eval, deploy pod var err error if jobStage == sednav1.ILJobDeploy { - err = jc.restartInferPod(incrementaljob) + err = c.restartInferPod(incrementaljob) if err != nil { klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) } else { klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name) } } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { - err = jc.createPod(incrementaljob, jobStage) + err = c.createPod(incrementaljob, jobStage) } if err != nil { return needUpdated, err @@ -406,7 +411,7 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo newConditionType = sednav1.ILJobStageCondRunning // add nodeName to job - if err := jc.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil { + if err := c.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil { return needUpdated, err } } @@ -439,10 +444,10 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo } // updateIncrementalJobStatus ensures that jobstatus can be updated rightly -func (jc *IncrementalJobController) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error { - jobClient := jc.client.IncrementalLearningJobs(incrementaljob.Namespace) +func (c 
*Controller) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error { + jobClient := c.client.IncrementalLearningJobs(incrementaljob.Namespace) var err error - for i := 0; i <= ResourceUpdateRetries; i++ { + for i := 0; i <= runtime.ResourceUpdateRetries; i++ { var newIncrementalJob *sednav1.IncrementalLearningJob newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{}) if err != nil { @@ -468,17 +473,17 @@ func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, j } } -func (jc *IncrementalJobController) generatePodName(jobName string, workerType string) string { +func (c *Controller) generatePodName(jobName string, workerType string) string { return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) } -func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod { +func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod { if podType == "Deploy" { - podType = InferencePodType + podType = runtime.InferencePodType } var latestPod *v1.Pod - selector, _ := GenerateSelector(job) - pods, err := jc.podStore.Pods(job.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if len(pods) == 0 || err != nil { return nil } @@ -498,20 +503,20 @@ func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLea return latestPod } -func (jc *IncrementalJobController) restartInferPod(job *sednav1.IncrementalLearningJob) error { - inferPod := jc.getSpecifiedPods(job, InferencePodType) +func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error { + inferPod := c.getSpecifiedPods(job, runtime.InferencePodType) if inferPod == nil { klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name) - err := jc.createInferPod(job) + err := c.createInferPod(job) return err } ctx := context.Background() - err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) + err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) if err != nil { klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err } - err = jc.createInferPod(job) + err = c.createInferPod(job) if err != nil { klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err @@ -537,9 +542,9 @@ func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool { return false } -func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { +func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { if name != "" { - secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { err = fmt.Errorf("failed to get the secret %s for %s: %w", name, @@ -549,7 +554,7 @@ func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr s return } -func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { +func (c 
*Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { ctx := context.Background() var podTemplate *v1.PodTemplateSpec @@ -558,25 +563,25 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo deployModelName := job.Spec.DeploySpec.Model.Name // check initial model name - initialModel, err := jc.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) + initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get initial model %s: %w", initialModelName, err) } - _, err = jc.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) + _, err = c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get deploy model %s: %w", deployModelName, err) } - dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get dataset %s: %w", incrementalDatasetName, err) } - datasetSecret, err := jc.getSecret( + datasetSecret, err := c.getSecret( job.Namespace, dataset.Spec.CredentialName, fmt.Sprintf("dataset %s", dataset.Name), @@ -585,7 +590,7 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo return err } - jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("incremental job %s", job.Name), @@ -597,7 +602,7 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo // get all url for train and eval from data in condition condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond IncrementalCondData + var cond runtime.IncrementalCondData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") @@ -614,25 +619,25 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo originalDataURLOrIndex = dataset.Spec.URL } - var workerParam *WorkerParam = new(WorkerParam) + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) if podtype == sednav1.ILJobTrain { - workerParam.workerType = TrainPodType + workerParam.WorkerType = runtime.TrainPodType podTemplate = &job.Spec.TrainSpec.Template // Env parameters for train - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "train-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } baseModelURL := inputmodelURLs[0] var baseModelSecret *v1.Secret if baseModelURL == initialModel.Spec.URL { - baseModelSecret, err = jc.getSecret( + baseModelSecret, err = c.getSecret( job.Namespace, initialModel.Spec.CredentialName, fmt.Sprintf("initial model %s", initialModelName), @@ -644,17 +649,17 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo baseModelSecret = jobSecret } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: baseModelURL, Secret: baseModelSecret, DownloadByInitializer: true, }, 
EnvName: "BASE_MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -662,8 +667,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo EnvName: "MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, DownloadByInitializer: true, Secret: jobSecret, @@ -672,8 +677,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo }, // see https://github.com/kubeedge/sedna/issues/35 - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -684,22 +689,22 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo ) } else { podTemplate = &job.Spec.EvalSpec.Template - workerParam.workerType = "Eval" + workerParam.WorkerType = "Eval" - // Configure Env information for eval by initial WorkerParam - workerParam.env = map[string]string{ + // Configure Env information for eval by initial runtime.WorkerParam + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "eval-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - var modelMountURLs []MountURL + var modelMountURLs []runtime.MountURL for _, url := range inputmodelURLs { var modelSecret *v1.Secret if url == initialModel.Spec.URL { - modelSecret, err = jc.getSecret( + modelSecret, err = c.getSecret( job.Namespace, initialModel.Spec.CredentialName, fmt.Sprintf("initial model %s", initialModelName), @@ -711,21 +716,21 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo modelSecret = jobSecret } - modelMountURLs = append(modelMountURLs, MountURL{ + modelMountURLs = append(modelMountURLs, runtime.MountURL{ URL: url, Secret: modelSecret, DownloadByInitializer: true, }) } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ URLs: modelMountURLs, Name: "models", EnvName: "MODEL_URLS", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: datasetSecret, DownloadByInitializer: true, @@ -734,8 +739,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo EnvName: "TEST_DATASET_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -748,20 +753,20 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo } // set the default policy instead of Always policy - workerParam.restartPolicy = v1.RestartPolicyOnFailure - workerParam.hostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure + workerParam.HostNetwork = true // create pod based on podtype - _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam) if err != nil { return err } return } -func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearningJob) error { +func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error { infermodelName := job.Spec.DeploySpec.Model.Name - inferModel, err := jc.client.Models(job.Namespace).Get(context.TODO(), 
infermodelName, metav1.GetOptions{}) + inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get infer model %s: %w", infermodelName, err) @@ -772,16 +777,16 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters) HEMParameterString := string(HEMParameterJSON) - // Configure container mounting and Env information by initial WorkerParam - modelSecret, err := jc.getSecret( + // Configure container mounting and Env information by initial runtime.WorkerParam + modelSecret, err := c.getSecret( job.Namespace, inferModel.Spec.CredentialName, fmt.Sprintf("model %s", inferModel.Name), ) - var workerParam *WorkerParam = new(WorkerParam) - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: inferModelURL, Secret: modelSecret, DownloadByInitializer: true, @@ -791,7 +796,7 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "inferworker-" + utilrand.String(5), @@ -799,25 +804,20 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name, "HEM_PARAMETERS": HEMParameterString, - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = InferencePodType - workerParam.hostNetwork = true + workerParam.WorkerType = runtime.InferencePodType + workerParam.HostNetwork = true // create edge pod - _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) return err } -// GetName returns the name of the incrementallearning job controller -func (jc *IncrementalJobController) GetName() string { - return "IncrementalLearningJobController" -} - -// NewIncrementalJobController creates a new IncrementalJob controller that keeps the relevant pods +// New creates a new IncrementalJob controller that keeps the relevant pods // in sync with their corresponding IncrementalJob objects. 
-func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { namespace := cfg.Namespace if namespace == "" { namespace = metav1.NamespaceAll @@ -846,11 +846,11 @@ func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControlle eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - jc := &IncrementalJobController{ + jc := &Controller{ kubeClient: kubeClient, client: crdclient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "incrementallearningjob"), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "incrementallearningjob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}), cfg: cfg, } diff --git a/pkg/globalmanager/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go similarity index 76% rename from pkg/globalmanager/jointinferenceservice.go rename to pkg/globalmanager/controllers/jointinference/jointinferenceservice.go index 8d22fa3b..d32ee83c 100644 --- a/pkg/globalmanager/jointinferenceservice.go +++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package jointinference import ( "context" @@ -47,20 +47,26 @@ import ( sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) +const ( + Name = "JointInference" +) + const ( jointInferenceForEdge = "Edge" jointInferenceForCloud = "Cloud" + bigModelPort = 5000 ) -// jointServiceControllerKind contains the schema.GroupVersionKind for this controller type. -var jointServiceControllerKind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService") +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService") -// JointInferenceServiceController ensures that all JointInferenceService objects +// Controller ensures that all JointInferenceService objects // have corresponding pods to run their configured workload. -type JointInferenceServiceController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -83,17 +89,17 @@ type JointInferenceServiceController struct { } // Start starts the main goroutine responsible for watching and syncing services. 
-func (jc *JointInferenceServiceController) Start() error { +func (c *Controller) Start() error { workers := 1 stopCh := messageContext.Done() go func() { defer utilruntime.HandleCrash() - defer jc.queue.ShutDown() + defer c.queue.ShutDown() klog.Infof("Starting joint inference service controller") defer klog.Infof("Shutting down joint inference service controller") - if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, jc.podStoreSynced, jc.serviceStoreSynced) { + if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, c.podStoreSynced, c.serviceStoreSynced) { klog.Errorf("failed to wait for joint inferce service caches to sync") return @@ -101,7 +107,7 @@ func (jc *JointInferenceServiceController) Start() error { klog.Infof("Starting joint inference service workers") for i := 0; i < workers; i++ { - go wait.Until(jc.worker, time.Second, stopCh) + go wait.Until(c.worker, time.Second, stopCh) } <-stopCh @@ -110,18 +116,18 @@ func (jc *JointInferenceServiceController) Start() error { } // enqueueByPod enqueues the jointInferenceService object of the specified pod. -func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != jointServiceControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - service, err := jc.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name) + service, err := c.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -130,27 +136,27 @@ func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate b return } - jc.enqueueController(service, immediate) + c.enqueueController(service, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (jc *JointInferenceServiceController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - jc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - jc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what joint inference service manage it and wake them up. 
-func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -159,11 +165,11 @@ func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) { return } - jc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the jointinferenceservice obj When a pod is deleted -func (jc *JointInferenceServiceController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -184,13 +190,13 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) { return } } - jc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sednav1.JointInferenceService, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. -func (jc *JointInferenceServiceController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { klog.Warningf("Couldn't get key for object %+v: %v", obj, err) @@ -199,42 +205,42 @@ func (jc *JointInferenceServiceController) enqueueController(obj interface{}, im backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the sync is never invoked concurrently with the same key. -func (jc *JointInferenceServiceController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *JointInferenceServiceController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } klog.Warningf("Error syncing jointinference service: %v", err) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } // sync will sync the jointinferenceservice with the given key. // This function is not meant to be invoked concurrently with the same key. 
-func (jc *JointInferenceServiceController) sync(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing jointinference service %q (%v)", key, time.Since(startTime)) @@ -247,7 +253,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid jointinference service key %q: either namespace or name is missing", key) } - sharedJointinferenceservice, err := jc.serviceLister.JointInferenceServices(ns).Get(name) + sharedJointinferenceservice, err := c.serviceLister.JointInferenceServices(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("JointInferenceService has been deleted: %v", key) @@ -265,10 +271,10 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { // set kind for jointinferenceservice in case that the kind is None // more details at https://github.com/kubernetes/kubernetes/issues/3030 - jointinferenceservice.SetGroupVersionKind(jointServiceControllerKind) + jointinferenceservice.SetGroupVersionKind(Kind) - selector, _ := GenerateSelector(&jointinferenceservice) - pods, err := jc.podStore.Pods(jointinferenceservice.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(&jointinferenceservice) + pods, err := c.podStore.Pods(jointinferenceservice.Namespace).List(selector) if err != nil { return false, err @@ -278,7 +284,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { latestConditionLen := len(jointinferenceservice.Status.Conditions) - active := calcActivePodCount(pods) + active := runtime.CalcActivePodCount(pods) var failed int32 = 0 // neededCounts means that two pods should be created successfully in a jointinference service currently // two pods consist of edge pod and cloud pod @@ -313,10 +319,10 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { reason = "workerFailed" message = "the worker of Jointinferenceservice failed" newCondtionType = sednav1.JointInferenceServiceCondFailed - jc.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message) + c.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message) } else { if len(pods) == 0 { - active, manageServiceErr = jc.createWorkers(&jointinferenceservice) + active, manageServiceErr = c.createWorkers(&jointinferenceservice) } if manageServiceErr != nil { serviceFailed = true @@ -340,7 +346,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { jointinferenceservice.Status.Active = active jointinferenceservice.Status.Failed = failed - if err := jc.updateStatus(&jointinferenceservice); err != nil { + if err := c.updateStatus(&jointinferenceservice); err != nil { return forget, err } @@ -367,10 +373,10 @@ func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServi } } -func (jc *JointInferenceServiceController) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error { - serviceClient := jc.client.JointInferenceServices(jointinferenceservice.Namespace) +func (c *Controller) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error { + serviceClient := c.client.JointInferenceServices(jointinferenceservice.Namespace) var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { + for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 { var newJointinferenceservice *sednav1.JointInferenceService 
newJointinferenceservice, err = serviceClient.Get(context.TODO(), jointinferenceservice.Name, metav1.GetOptions{})
 		if err != nil {
@@ -393,11 +399,11 @@ func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
 	return false
 }
 
-func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
+func (c *Controller) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
 	active = 0
 
 	// create cloud worker
-	err = jc.createCloudWorker(service)
+	err = c.createCloudWorker(service)
 	if err != nil {
 		return active, err
 	}
@@ -406,14 +412,14 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
 
 	// create k8s service for cloudPod
 	// FIXME(llhuii): only the case where Spec.NodeName is specified is supported;
 	// Spec.NodeSelector support will be added later.
-	bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
-	bigServicePort, err := CreateKubernetesService(jc.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
+	bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
+	bigServicePort, err := runtime.CreateKubernetesService(c.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
 	if err != nil {
 		return active, err
 	}
 
 	// create edge worker
-	err = jc.createEdgeWorker(service, bigServicePort)
+	err = c.createEdgeWorker(service, bigServicePort)
 	if err != nil {
 		return active, err
 	}
@@ -422,24 +428,24 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
 	return active, err
 }
 
-func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.JointInferenceService) error {
+func (c *Controller) createCloudWorker(service *sednav1.JointInferenceService) error {
 	// deliver pod for cloudworker
 	cloudModelName := service.Spec.CloudWorker.Model.Name
-	cloudModel, err := jc.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
+	cloudModel, err := c.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get cloud model %s: %w",
 			cloudModelName, err)
 	}
 
-	var workerParam WorkerParam
+	var workerParam runtime.WorkerParam
 
 	secretName := cloudModel.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}
 
-	workerParam.mounts = append(workerParam.mounts, WorkerMount{
-		URL: &MountURL{
+	workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
+		URL: &runtime.MountURL{
 			URL:                   cloudModel.Spec.URL,
 			Secret:                modelSecret,
 			DownloadByInitializer: true,
@@ -448,7 +454,7 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo
 		EnvName:   "MODEL_URL",
 	})
 
-	workerParam.env = map[string]string{
+	workerParam.Env = map[string]string{
 		"NAMESPACE":           service.Namespace,
 		"SERVICE_NAME":        service.Name,
 		"WORKER_NAME":         "cloudworker-" + utilrand.String(5),
 
 		"BIG_MODEL_BIND_PORT": strconv.Itoa(int(bigModelPort)),
 	}
 
-	workerParam.workerType = jointInferenceForCloud
+	workerParam.WorkerType = jointInferenceForCloud
 
 	// create cloud pod
-	_, err = createPodWithTemplate(jc.kubeClient,
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient,
 		service,
 		&service.Spec.CloudWorker.Template,
 		&workerParam)
 	return err
 }
 
-func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
+func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
 	// deliver pod for edgeworker
 	ctx := context.Background()
 	edgeModelName := service.Spec.EdgeWorker.Model.Name
-	edgeModel, err := jc.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
+	edgeModel, err := c.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get edge model %s: %w",
 			edgeModelName, err)
@@ -479,13 +485,13 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 	secretName := edgeModel.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}
 
 	// FIXME(llhuii): only the case where Spec.NodeName is specified is supported;
 	// Spec.NodeSelector support will be added later.
 	// get bigModelIP from nodeName in cloudWorker
-	bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
+	bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
 	if err != nil {
 		return fmt.Errorf("failed to get node ip: %w", err)
 	}
 
 	edgeWorker := service.Spec.EdgeWorker
 	HEMParameterJSON, _ := json.Marshal(edgeWorker.HardExampleMining.Parameters)
 	HEMParameterString := string(HEMParameterJSON)
 
-	var workerParam WorkerParam
+	var workerParam runtime.WorkerParam
 
-	workerParam.mounts = append(workerParam.mounts, WorkerMount{
-		URL: &MountURL{
+	workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
+		URL: &runtime.MountURL{
 			URL:                   edgeModel.Spec.URL,
 			Secret:                modelSecret,
 			DownloadByInitializer: true,
@@ -506,7 +512,7 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 		EnvName:   "MODEL_URL",
 	})
 
-	workerParam.env = map[string]string{
+	workerParam.Env = map[string]string{
 		"NAMESPACE":    service.Namespace,
 		"SERVICE_NAME": service.Name,
 		"WORKER_NAME":  "edgeworker-" + utilrand.String(5),
 
 		"BIG_MODEL_IP":   bigModelIP,
 		"BIG_MODEL_PORT": strconv.Itoa(int(bigServicePort)),
 
 		"HEM_NAME":       edgeWorker.HardExampleMining.Name,
 		"HEM_PARAMETERS": HEMParameterString,
 
@@ -517,28 +523,23 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
-		"LC_SERVER": jc.cfg.LC.Server,
+		"LC_SERVER": c.cfg.LC.Server,
 	}
 
-	workerParam.workerType = jointInferenceForEdge
-	workerParam.hostNetwork = true
+	workerParam.WorkerType = jointInferenceForEdge
+	workerParam.HostNetwork = true
 
 	// create edge pod
-	_, err = createPodWithTemplate(jc.kubeClient,
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient,
 		service,
 		&service.Spec.EdgeWorker.Template,
 		&workerParam)
 	return err
 }
 
-// GetName returns the name of the joint inference controller
-func (jc *JointInferenceServiceController) GetName() string {
-	return "JointInferenceServiceController"
-}
-
-// NewJointController creates a new JointInferenceService controller that keeps the relevant pods
+// New creates a new JointInferenceService controller that keeps the relevant pods
 // in sync with their corresponding
JointInferenceService objects. -func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { var err error namespace := cfg.Namespace if namespace == "" { @@ -558,11 +559,11 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - jc := &JointInferenceServiceController{ + jc := &Controller{ kubeClient: kubeClient, client: crdclient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "jointinferenceservice"), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "jointinferenceservice"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "jointinferenceservice-controller"}), cfg: cfg, } diff --git a/pkg/globalmanager/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go similarity index 76% rename from pkg/globalmanager/lifelonglearningjob.go rename to pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go index 73e2efc4..2d95dcc5 100644 --- a/pkg/globalmanager/lifelonglearningjob.go +++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package lifelonglearning import ( "context" @@ -46,15 +46,20 @@ import ( sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) -// ljControllerKind contains the schema.GroupVersionKind for this controller type. -var ljControllerKind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob") +const ( + Name = "LifelongLearning" +) + +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob") -// LifelongLearningJobController ensures that all LifelongLearningJob objects have corresponding pods to +// Controller ensures that all LifelongLearningJob objects have corresponding pods to // run their configured workload. -type LifelongLearningJobController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -80,24 +85,24 @@ type LifelongLearningJobController struct { } // Run the main goroutine responsible for watching and syncing jobs. 
-func (jc *LifelongLearningJobController) Start() error {
+func (c *Controller) Start() error {
 	workers := 1
 	stopCh := messageContext.Done()
 
 	go func() {
 		defer utilruntime.HandleCrash()
-		defer jc.queue.ShutDown()
+		defer c.queue.ShutDown()
 		klog.Infof("Starting lifelonglearning job controller")
 		defer klog.Infof("Shutting down lifelonglearning job controller")
 
-		if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) {
+		if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, c.podStoreSynced, c.jobStoreSynced) {
 			klog.Errorf("failed to wait for caches to sync")
 
 			return
 		}
 
 		klog.Infof("Starting lifelonglearning job workers")
 		for i := 0; i < workers; i++ {
-			go wait.Until(jc.worker, time.Second, stopCh)
+			go wait.Until(c.worker, time.Second, stopCh)
 		}
 
 		<-stopCh
@@ -106,18 +111,18 @@ func (jc *LifelongLearningJobController) Start() error {
 }
 
 // enqueueByPod enqueues the lifelonglearningjob object of the specified pod.
-func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate bool) {
+func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)
 
 	if controllerRef == nil {
 		return
 	}
 
-	if controllerRef.Kind != ljControllerKind.Kind {
+	if controllerRef.Kind != Kind.Kind {
 		return
 	}
 
-	service, err := jc.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name)
+	service, err := c.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name)
 	if err != nil {
 		return
 	}
@@ -126,27 +131,27 @@ func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate boo
 		return
 	}
 
-	jc.enqueueController(service, immediate)
+	c.enqueueController(service, immediate)
 }
 
 // When a pod is created, enqueue the controller that manages it and update its expectations.
-func (jc *LifelongLearningJobController) addPod(obj interface{}) {
+func (c *Controller) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller, it's possible a new pod shows up in a state that
 		// is already pending deletion. Prevent the pod from being a creation observation.
-		jc.deletePod(pod)
+		c.deletePod(pod)
 		return
 	}
 
 	// backoff to queue when PodFailed
 	immediate := pod.Status.Phase != v1.PodFailed
 
-	jc.enqueueByPod(pod, immediate)
+	c.enqueueByPod(pod, immediate)
 }
 
 // When a pod is updated, figure out what lifelonglearning job manages it and wake it up.
-func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) {
+func (c *Controller) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)
 
@@ -155,11 +160,11 @@ func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) {
 		return
 	}
 
-	jc.addPod(curPod)
+	c.addPod(curPod)
 }
 
 // deletePod enqueues the lifelonglearningjob object when a pod is deleted
-func (jc *LifelongLearningJobController) deletePod(obj interface{}) {
+func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)
 
 	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -180,13 +185,13 @@ func (jc *LifelongLearningJobController) deletePod(obj interface{}) {
 			return
 		}
 	}
-	jc.enqueueByPod(pod, true)
+	c.enqueueByPod(pod, true)
 }
 
 // obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.
-func (jc *LifelongLearningJobController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) @@ -195,36 +200,36 @@ func (jc *LifelongLearningJobController) enqueueController(obj interface{}, imme backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. -func (jc *LifelongLearningJobController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *LifelongLearningJobController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err)) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } @@ -232,7 +237,7 @@ func (jc *LifelongLearningJobController) processNextWorkItem() bool { // sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
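enqueueController delays failed keys by runtime.GetBackoff, whose body appears only partially in the runtime/common.go hunk later in this patch. Filling in the elided lines along the standard Kubernetes job-controller calculation, it is approximately:

```go
// Approximate shape of runtime.GetBackoff: exponential backoff derived from
// the key's requeue count, capped at MaxBackOff. The elided middle lines are
// reconstructed here following the upstream job-controller pattern.
func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
	exp := queue.NumRequeues(key)

	if exp <= 0 {
		return time.Duration(0)
	}

	// DefaultBackOff * 2^(exp-1), guarding against overflow
	backoff := float64(DefaultBackOff.Nanoseconds()) * math.Pow(2, float64(exp-1))
	if backoff > math.MaxInt64 {
		return MaxBackOff
	}

	calculated := time.Duration(backoff)
	if calculated > MaxBackOff {
		return MaxBackOff
	}
	return calculated
}
```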
-func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
+func (c *Controller) sync(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime))
@@ -245,7 +250,7 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key)
 	}
-	sharedLifelongLearningJob, err := jc.jobLister.LifelongLearningJobs(ns).Get(name)
+	sharedLifelongLearningJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("lifelonglearning job has been deleted: %v", key)
@@ -273,13 +278,13 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
 	needUpdated := false
 
 	// update conditions of lifelonglearning job
-	needUpdated, err = jc.updateLifelongLearningJobConditions(&lifelonglearningjob)
+	needUpdated, err = c.updateLifelongLearningJobConditions(&lifelonglearningjob)
 	if err != nil {
 		klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
 	}
 
 	if needUpdated {
-		if err := jc.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil {
+		if err := c.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil {
 			return forget, err
 		}
 
@@ -295,7 +300,7 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
 }
 
 // updateLifelongLearningJobConditions ensures that the conditions of a lifelonglearning job are updated according to pod status
-func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) {
+func (c *Controller) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) {
 	var initialType sednav1.LLJobStageConditionType
 	var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{
 		Stage: sednav1.LLJobTrain,
@@ -311,7 +316,7 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
 	latestCondition = (jobConditions)[len(jobConditions)-1]
 	klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", lifelonglearningjob.Namespace, lifelonglearningjob.Name, latestCondition.Stage)
-	pod := jc.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage))
+	pod := c.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage))
 
 	if pod != nil {
 		podStatus = pod.Status.Phase
@@ -333,14 +338,14 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
 		// include train, eval, deploy pod
 		var err error
 		if jobStage == sednav1.LLJobDeploy {
-			err = jc.restartInferPod(lifelonglearningjob)
+			err = c.restartInferPod(lifelonglearningjob)
 			if err != nil {
 				klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
 			} else {
 				klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", lifelonglearningjob.Namespace, lifelonglearningjob.Name)
 			}
 		} else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
-			err = jc.createPod(lifelonglearningjob, jobStage)
+			err = c.createPod(lifelonglearningjob, jobStage)
 		}
 		if err != nil {
 			return needUpdated, err
@@ -364,7 +369,7 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
 			klog.V(2).Infof("lifelonglearning job %v/%v %v stage
failed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage) } case sednav1.LLJobStageCondCompleted: - jobStage = jc.getNextStage(jobStage) + jobStage = c.getNextStage(jobStage) newConditionType = sednav1.LLJobStageCondWaiting case sednav1.LLJobStageCondFailed: @@ -384,10 +389,10 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif } // updateLifelongLearningJobStatus ensures that jobstatus can be updated rightly -func (jc *LifelongLearningJobController) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error { - jobClient := jc.client.LifelongLearningJobs(lifelonglearningjob.Namespace) +func (c *Controller) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error { + jobClient := c.client.LifelongLearningJobs(lifelonglearningjob.Namespace) var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { + for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 { var newLifelongLearningJob *sednav1.LifelongLearningJob newLifelongLearningJob, err = jobClient.Get(context.TODO(), lifelonglearningjob.Name, metav1.GetOptions{}) if err != nil { @@ -413,17 +418,17 @@ func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionTy } } -func (jc *LifelongLearningJobController) generatePodName(jobName string, workerType string) string { +func (c *Controller) generatePodName(jobName string, workerType string) string { return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) } -func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { +func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { if podType == "Deploy" { - podType = InferencePodType + podType = runtime.InferencePodType } var latestPod *v1.Pod - selector, _ := GenerateSelector(job) - pods, err := jc.podStore.Pods(job.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if len(pods) == 0 || err != nil { return nil } @@ -443,20 +448,20 @@ func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongL return latestPod } -func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLearningJob) error { - inferPod := jc.getSpecifiedPods(job, InferencePodType) +func (c *Controller) restartInferPod(job *sednav1.LifelongLearningJob) error { + inferPod := c.getSpecifiedPods(job, runtime.InferencePodType) if inferPod == nil { klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name) - err := jc.createInferPod(job) + err := c.createInferPod(job) return err } ctx := context.Background() - err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) + err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) if err != nil { klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err } - err = jc.createInferPod(job) + err = c.createInferPod(job) if err != nil { klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err @@ -464,7 +469,7 @@ func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLe return nil } -func (jc *LifelongLearningJobController) getNextStage(currentStage 
sednav1.LLJobStage) sednav1.LLJobStage { +func (c *Controller) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage { switch currentStage { case sednav1.LLJobTrain: return sednav1.LLJobEval @@ -477,9 +482,9 @@ func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJob } } -func (jc *LifelongLearningJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { +func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { if name != "" { - secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { err = fmt.Errorf("failed to get the secret %s for %s: %w", name, @@ -494,18 +499,18 @@ func IsLifelongLearningJobFinished(j *sednav1.LifelongLearningJob) bool { return false } -func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { +func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { ctx := context.Background() var podTemplate *v1.PodTemplateSpec LLDatasetName := job.Spec.Dataset.Name - dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err) } - datasetSecret, err := jc.getSecret( + datasetSecret, err := c.getSecret( job.Namespace, dataset.Spec.CredentialName, fmt.Sprintf("dataset %s", dataset.Name), @@ -514,7 +519,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning return err } - jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("lifelonglearning job %s", job.Name), @@ -526,7 +531,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning // get all url for train and eval from data in condition condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond LifelongLearningCondData + var cond runtime.LifelongLearningCondData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") @@ -543,25 +548,25 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning originalDataURLOrIndex = dataset.Spec.URL } - var workerParam *WorkerParam = new(WorkerParam) + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) if podtype == sednav1.LLJobTrain { - workerParam.workerType = "Train" + workerParam.WorkerType = "Train" podTemplate = &job.Spec.TrainSpec.Template // Env parameters for train - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "train-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, - "KB_SERVER": jc.cfg.KB.Server, + "LC_SERVER": c.cfg.LC.Server, + "KB_SERVER": c.cfg.KB.Server, } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -569,8 +574,8 
@@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "OUTPUT_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: jobSecret, DownloadByInitializer: true, @@ -579,8 +584,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning }, // see https://github.com/kubeedge/sedna/issues/35 - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, Indirect: dataset.Spec.URL != originalDataURLOrIndex, @@ -591,35 +596,35 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning ) } else { podTemplate = &job.Spec.EvalSpec.Template - workerParam.workerType = "Eval" + workerParam.WorkerType = "Eval" - // Configure Env information for eval by initial WorkerParam - workerParam.env = map[string]string{ + // Configure Env information for eval by initial runtime.WorkerParam + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "eval-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, - "KB_SERVER": jc.cfg.KB.Server, + "LC_SERVER": c.cfg.LC.Server, + "KB_SERVER": c.cfg.KB.Server, } - var modelMountURLs []MountURL + var modelMountURLs []runtime.MountURL for _, url := range inputmodelURLs { - modelMountURLs = append(modelMountURLs, MountURL{ + modelMountURLs = append(modelMountURLs, runtime.MountURL{ URL: url, Secret: jobSecret, DownloadByInitializer: true, }) } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ URLs: modelMountURLs, Name: "models", EnvName: "MODEL_URLS", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -627,8 +632,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "OUTPUT_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: datasetSecret, DownloadByInitializer: true, @@ -637,8 +642,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "TEST_DATASET_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -651,21 +656,21 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning } // set the default policy instead of Always policy - workerParam.restartPolicy = v1.RestartPolicyOnFailure - workerParam.hostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure + workerParam.HostNetwork = true // create pod based on podtype - _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam) if err != nil { return err } return } -func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLearningJob) error { +func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/") - jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("lifelonglearning job %s", job.Name), @@ -674,10 +679,10 @@ func (jc 
*LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea return err } - var workerParam *WorkerParam = new(WorkerParam) - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: inferModelURL, Secret: jobSecret, DownloadByInitializer: false, @@ -687,30 +692,25 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "inferworker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = InferencePodType - workerParam.hostNetwork = true + workerParam.WorkerType = runtime.InferencePodType + workerParam.HostNetwork = true // create edge pod - _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) return err } -// GetName returns the name of the lifelonglearning job controller -func (jc *LifelongLearningJobController) GetName() string { - return "LifelongLearningJobController" -} - -// NewLifelongLearningJobController creates a new LifelongLearningJob controller that keeps the relevant pods +// New creates a new LifelongLearningJob controller that keeps the relevant pods // in sync with their corresponding LifelongLearningJob objects. -func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { namespace := cfg.Namespace if namespace == "" { namespace = metav1.NamespaceAll @@ -739,10 +739,10 @@ func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureCont eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - jc := &LifelongLearningJobController{ + jc := &Controller{ kubeClient: kubeClient, client: crdclient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "lifelonglearningjob"), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "lifelonglearningjob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}), cfg: cfg, } diff --git a/pkg/globalmanager/controller.go b/pkg/globalmanager/controllers/manager.go similarity index 50% rename from pkg/globalmanager/controller.go rename to pkg/globalmanager/controllers/manager.go index 0085fe8f..dfb9149c 100644 --- a/pkg/globalmanager/controller.go +++ b/pkg/globalmanager/controllers/manager.go @@ -14,11 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
 
-package globalmanager
+package controllers
 
 import (
 	"fmt"
-	"os"
 
 	"k8s.io/klog/v2"
 
@@ -26,46 +25,44 @@ import (
 	websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
 )
 
-// MainController defines the main controller
-type MainController struct {
+// Manager defines the controller manager
+type Manager struct {
 	Config *config.ControllerConfig
 }
 
-// NewController creates a new main controller
-func NewController(cc *config.ControllerConfig) *MainController {
+// New creates the controller manager
+func New(cc *config.ControllerConfig) *Manager {
 	config.InitConfigure(cc)
-	return &MainController{
+	return &Manager{
 		Config: cc,
 	}
 }
 
-// Start starts the main controller
-func (c *MainController) Start() {
-	type newFunc func(cfg *config.ControllerConfig) (FeatureControllerI, error)
+// Start starts the controllers it manages
+func (m *Manager) Start() error {
+	uc, _ := NewUpstreamController(m.Config)
+	dc, _ := NewDownstreamController(m.Config)
+	uc.Start()
+	dc.Start()
 
-	for _, featureFunc := range []newFunc{
-		NewUpstreamController,
-		NewDownstreamController,
-		NewFederatedController,
-		NewJointController,
-		NewIncrementalJobController,
-		NewLifelongLearningJobController,
-	} {
-		f, _ := featureFunc(c.Config)
-		err := f.Start()
+	for name, factory := range NewRegistry() {
+		f, err := factory(m.Config)
 		if err != nil {
-			klog.Warningf("failed to start controller %s: %+v", f.GetName(), err)
-		} else {
-			klog.Infof("started controller %s", f.GetName())
+			return fmt.Errorf("failed to initialize controller %s: %v", name, err)
 		}
+		err = f.Start()
+		if err != nil {
+			return fmt.Errorf("failed to start controller %s: %v", name, err)
+		}
+		klog.Infof("started controller %s", name)
 	}
 
-	addr := fmt.Sprintf("%s:%d", c.Config.WebSocket.Address, c.Config.WebSocket.Port)
+	addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port)
 
 	ws := websocket.NewServer(addr)
 	err := ws.ListenAndServe()
 	if err != nil {
-		klog.Fatalf("failed to listen websocket at %s", addr)
-		os.Exit(1)
+		return fmt.Errorf("failed to listen websocket at %s: %v", addr, err)
 	}
+	return nil
 }
diff --git a/pkg/globalmanager/controllers/registry.go b/pkg/globalmanager/controllers/registry.go
new file mode 100644
index 00000000..af419fc1
--- /dev/null
+++ b/pkg/globalmanager/controllers/registry.go
@@ -0,0 +1,33 @@
+package controllers
+
+import (
+	"fmt"
+
+	"github.com/kubeedge/sedna/pkg/globalmanager/config"
+	fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning"
+	il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
+	ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference"
+	ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+type FeatureFactory = func(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error)
+
+type Registry map[string]FeatureFactory
+
+func (r Registry) Register(name string, factory FeatureFactory) error {
+	if _, ok := r[name]; ok {
+		return fmt.Errorf("a feature controller named %s already exists", name)
+	}
+	r[name] = factory
+	return nil
+}
+
+func NewRegistry() Registry {
+	return Registry{
+		ji.Name: ji.New,
+		fl.Name: fl.New,
+		il.Name: il.New,
+		ll.Name: ll.New,
+	}
+}
diff --git a/pkg/globalmanager/upstream.go b/pkg/globalmanager/controllers/upstream.go
similarity index 96%
rename from pkg/globalmanager/upstream.go
rename to pkg/globalmanager/controllers/upstream.go
index 13d64483..66b9172a 100644
---
a/pkg/globalmanager/upstream.go +++ b/pkg/globalmanager/controllers/upstream.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package controllers import ( "context" @@ -29,7 +29,9 @@ import ( sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" + fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning" "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) @@ -250,8 +252,8 @@ func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace // Output defines job output information type Output struct { - Models []Model `json:"models"` - JobInfo *JobInfo `json:"ownerInfo"` + Models []runtime.Model `json:"models"` + JobInfo *JobInfo `json:"ownerInfo"` } var status struct { @@ -286,7 +288,7 @@ func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace // TODO: more meaningful reason/message reason := "DoTraining" message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) - cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) + cond := fl.NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) uc.appendFederatedLearningJobStatusCondition(name, namespace, cond) } } @@ -325,7 +327,7 @@ func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, // Get the condition data. // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData IncrementalCondData + var condData runtime.IncrementalCondData err = json.Unmarshal(content, &condData) if err != nil { return newUnmarshalError(namespace, name, operation, content) @@ -402,7 +404,7 @@ func (uc *UpstreamController) updateLifelongLearningJobFromEdge(name, namespace, // Get the condition data. // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData LifelongLearningCondData + var condData runtime.LifelongLearningCondData err = json.Unmarshal(content, &condData) if err != nil { return newUnmarshalError(namespace, name, operation, content) @@ -495,7 +497,7 @@ func (uc *UpstreamController) GetName() string { } // NewUpstreamController creates a new Upstream controller from config -func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) { +func NewUpstreamController(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { client, err := utils.NewCRDClient() if err != nil { return nil, fmt.Errorf("create crd client failed with error: %w", err) diff --git a/pkg/globalmanager/common.go b/pkg/globalmanager/runtime/common.go similarity index 92% rename from pkg/globalmanager/common.go rename to pkg/globalmanager/runtime/common.go index 85842b3d..531fa27c 100644 --- a/pkg/globalmanager/common.go +++ b/pkg/globalmanager/runtime/common.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
 
-package globalmanager
+package runtime
 
 import (
 	"context"
@@ -33,8 +33,7 @@ const (
 	// DefaultBackOff is the default backoff period
 	DefaultBackOff = 10 * time.Second
 	// MaxBackOff is the max backoff period
-	MaxBackOff         = 360 * time.Second
-	bigModelPort int32 = 5000
+	MaxBackOff = 360 * time.Second
 	// ResourceUpdateRetries defines times of retrying to update resource
 	ResourceUpdateRetries = 3
 )
@@ -62,8 +61,8 @@ func GetNodeIPByName(kubeClient kubernetes.Interface, name string) (string, erro
 	return "", fmt.Errorf("can't find node ip for node %s", name)
 }
 
-// getBackoff calc the next wait time for the key
-func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
+// GetBackoff calculates the next wait time for the key
+func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
 	exp := queue.NumRequeues(key)
 
 	if exp <= 0 {
@@ -83,7 +82,7 @@ func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Dur
 	return calculated
 }
 
-func calcActivePodCount(pods []*v1.Pod) int32 {
+func CalcActivePodCount(pods []*v1.Pod) int32 {
 	var result int32 = 0
 	for _, p := range pods {
 		if v1.PodSucceeded != p.Status.Phase &&
diff --git a/pkg/globalmanager/secret_injector.go b/pkg/globalmanager/runtime/secret_injector.go
similarity index 99%
rename from pkg/globalmanager/secret_injector.go
rename to pkg/globalmanager/runtime/secret_injector.go
index 6b5577f2..7649dfaa 100644
--- a/pkg/globalmanager/secret_injector.go
+++ b/pkg/globalmanager/runtime/secret_injector.go
@@ -14,7 +14,7 @@ See the License for the specific language governing
 permissions and limitations under the License.
 */
 
-package globalmanager
+package runtime
 
 import (
 	"encoding/json"
diff --git a/pkg/globalmanager/storage_initializer_injector.go b/pkg/globalmanager/runtime/storage_initializer_injector.go
similarity index 97%
rename from pkg/globalmanager/storage_initializer_injector.go
rename to pkg/globalmanager/runtime/storage_initializer_injector.go
index e6ee0d09..f9df1af8 100644
--- a/pkg/globalmanager/storage_initializer_injector.go
+++ b/pkg/globalmanager/runtime/storage_initializer_injector.go
@@ -14,7 +14,7 @@ See the License for the specific language governing
 permissions and limitations under the License.
*/ -package globalmanager +package runtime import ( "net/url" @@ -179,7 +179,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { hostPathType := v1.HostPathDirectory - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.HostPath == "" { continue @@ -240,7 +240,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { func injectWorkerSecrets(pod *v1.Pod, workerParam *WorkerParam) { var secretEnvs []v1.EnvVar - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.Disable || m.DownloadByInitializer { continue @@ -259,7 +259,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { var downloadPairs []string var secretEnvs []v1.EnvVar - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.Disable { continue @@ -345,7 +345,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { var mounts []WorkerMount // parse the mounts and environment key - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { var envPaths []string if mount.URL != nil { @@ -374,13 +374,13 @@ func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { } if mount.EnvName != "" { - workerParam.env[mount.EnvName] = strings.Join( + workerParam.Env[mount.EnvName] = strings.Join( envPaths, urlsFieldSep, ) } } - workerParam.mounts = mounts + workerParam.Mounts = mounts // need to call injectInitializerContainer before injectHostPathMount // since injectHostPathMount could inject volumeMount to init container diff --git a/pkg/globalmanager/types.go b/pkg/globalmanager/runtime/types.go similarity index 99% rename from pkg/globalmanager/types.go rename to pkg/globalmanager/runtime/types.go index 2fb9534b..0f5788fa 100644 --- a/pkg/globalmanager/types.go +++ b/pkg/globalmanager/runtime/types.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
 
-package globalmanager
+package runtime
 
 import (
 	"encoding/json"
@@ -34,7 +34,6 @@ type CommonInterface interface {
 // FeatureControllerI defines the interface of an AI Feature controller
 type FeatureControllerI interface {
 	Start() error
-	GetName() string
 }
 
 type Model struct {
diff --git a/pkg/globalmanager/worker.go b/pkg/globalmanager/runtime/worker.go
similarity index 88%
rename from pkg/globalmanager/worker.go
rename to pkg/globalmanager/runtime/worker.go
index dc950faf..fab3dd13 100644
--- a/pkg/globalmanager/worker.go
+++ b/pkg/globalmanager/runtime/worker.go
@@ -1,4 +1,4 @@
-package globalmanager
+package runtime
 
 import (
 	"context"
@@ -27,15 +27,15 @@ type WorkerMount struct {
 
 // WorkerParam describes the system-defined parameters of worker
 type WorkerParam struct {
-	mounts []WorkerMount
+	Mounts []WorkerMount
 
-	env        map[string]string
-	workerType string
+	Env        map[string]string
+	WorkerType string
 
 	// if true, force to use hostNetwork
-	hostNetwork bool
+	HostNetwork bool
 
-	restartPolicy v1.RestartPolicy
+	RestartPolicy v1.RestartPolicy
 }
 
 // generateLabels generates labels for an object
@@ -105,11 +105,11 @@ func CreateKubernetesService(kubeClient kubernetes.Interface, object CommonInter
 	return service.Spec.Ports[0].NodePort, nil
 }
 
-// injectWorkerParam modifies pod in-place
+// injectWorkerParam modifies the pod in place with the given worker parameters
 func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) {
 	InjectStorageInitializer(pod, workerParam)
 
-	envs := createEnvVars(workerParam.env)
+	envs := createEnvVars(workerParam.Env)
 	for idx := range pod.Spec.Containers {
 		pod.Spec.Containers[idx].Env = append(
 			pod.Spec.Containers[idx].Env, envs...,
@@ -121,27 +121,27 @@ func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInter
 		pod.Labels = make(map[string]string)
 	}
 
-	for k, v := range generateLabels(object, workerParam.workerType) {
+	for k, v := range generateLabels(object, workerParam.WorkerType) {
 		pod.Labels[k] = v
 	}
 
-	pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.workerType) + "-"
+	pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.WorkerType) + "-"
 
 	pod.Namespace = object.GetNamespace()
 
-	if workerParam.hostNetwork {
+	if workerParam.HostNetwork {
 		// FIXME
 		// force to set hostnetwork
 		pod.Spec.HostNetwork = true
 	}
 
 	if pod.Spec.RestartPolicy == "" {
-		pod.Spec.RestartPolicy = workerParam.restartPolicy
+		pod.Spec.RestartPolicy = workerParam.RestartPolicy
 	}
 }
 
-// createPodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam
-func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) {
+// CreatePodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam
+func CreatePodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) {
 	objectKind := object.GroupVersionKind()
 	pod, _ := k8scontroller.GetPodFromTemplate(spec, object, metav1.NewControllerRef(object, objectKind))
 	injectWorkerParam(pod, workerParam, object)
@@ -149,7 +149,7 @@ func createPodWithTemplate(client kubernetes.Interface, object CommonInterface,
 	createdPod, err := client.CoreV1().Pods(object.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
 	objectName := object.GetNamespace() + "/" + object.GetName()
 	if err != nil {
-		klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.workerType, objectKind, objectName, err)
+		klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.WorkerType, objectKind, objectName, err)
 		return nil, err
 	}
 	klog.V(2).Infof("pod %s is created successfully for %s %s", createdPod.Name, objectKind, objectName)
diff --git a/pkg/localcontroller/manager/incrementallearningjob.go b/pkg/localcontroller/manager/incrementallearningjob.go
index 70b826b4..e565e4d0 100644
--- a/pkg/localcontroller/manager/incrementallearningjob.go
+++ b/pkg/localcontroller/manager/incrementallearningjob.go
@@ -31,7 +31,7 @@ import (
 	"github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	"github.com/kubeedge/sedna/pkg/globalmanager"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 	"github.com/kubeedge/sedna/pkg/localcontroller/db"
 	"github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
 	"github.com/kubeedge/sedna/pkg/localcontroller/storage"
@@ -437,11 +437,11 @@ func newTrigger(t sednav1.Trigger) (trigger.Base, error) {
 func (im *IncrementalJobManager) getTrainOrEvalModel(job *IncrementalLearningJob, jobStage sednav1.ILJobStage) *ModelInfo {
 	jobConditions := job.Status.Conditions
 
-	// TODO: globalmanager.type changes to common.type for gm and lc
-	var models []globalmanager.Model
+	// TODO: runtime.type changes to common.type for gm and lc
+	var models []runtime.Model
 
 	for i := len(jobConditions) - 1; i >= 0; i-- {
-		var cond globalmanager.IncrementalCondData
+		var cond runtime.IncrementalCondData
 		jobCond := jobConditions[i]
 		if jobCond.Stage == sednav1.ILJobTrain && jobCond.Type == sednav1.ILJobStageCondCompleted {
 			if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil {

From 55f835b1a00b8f2cdb6714ab122b15bf338da235 Mon Sep 17 00:00:00 2001
From: llhuii
Date: Thu, 22 Jul 2021 17:42:21 +0800
Subject: [PATCH 2/7] gm: refactor upstream controller

Split the upstream controller, merging each feature's CR logic into its
own controller.

Signed-off-by: llhuii
---
 .../controllers/dataset/dataset.go            |  58 +++
 .../federatedlearning/federatedlearningjob.go | 104 +++-
 .../incrementallearningjob.go                 |  87 +++-
 .../jointinference/jointinferenceservice.go   |  72 ++-
 .../lifelonglearning/lifelonglearningjob.go   |  86 +++-
 pkg/globalmanager/controllers/manager.go      |   7 +-
 pkg/globalmanager/controllers/registry.go     |   3 +-
 pkg/globalmanager/controllers/upstream.go     | 452 +-----------------
 pkg/globalmanager/runtime/common.go           |  38 ++
 pkg/globalmanager/runtime/types.go            |  13 +
 10 files changed, 477 insertions(+), 443 deletions(-)
 create mode 100644 pkg/globalmanager/controllers/dataset/dataset.go

diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go
new file mode 100644
index 00000000..8964f641
--- /dev/null
+++ b/pkg/globalmanager/controllers/dataset/dataset.go
@@ -0,0 +1,58 @@
+package dataset
+
+import (
+	"context"
+	"encoding/json"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/config"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/tools/cache"
+
+	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
+	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+// Controller handles all dataset objects, including syncing them to the edge and handling updates from the edge.
+type Controller struct { + client sednaclientset.SednaV1alpha1Interface + + storeSynced cache.InformerSynced + + // A store of dataset + lister sednav1listers.DatasetLister + + cfg *config.ControllerConfig +} + +// updateDatasetFromEdge syncs update from edge +func (c *Controller) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { + status := sednav1.DatasetStatus{} + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + return c.updateDatasetStatus(name, namespace, status) +} + +// updateDatasetStatus updates the dataset status +func (c *Controller) updateDatasetStatus(name, namespace string, status sednav1.DatasetStatus) error { + client := c.client.Datasets(namespace) + + if status.UpdateTime == nil { + now := metav1.Now() + status.UpdateTime = &now + } + + return runtime.RetryUpdateStatus(name, namespace, func() error { + dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + dataset.Status = status + _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) + return err + }) +} diff --git a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index 103cfcb5..47448742 100644 --- a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -18,6 +18,7 @@ package federatedlearning import ( "context" + "encoding/json" "fmt" "strconv" "time" @@ -51,6 +52,9 @@ import ( ) const ( + // KindName is the kind name of CR this controller controls + KindName = "FederatedLearningJob" + // Name is this controller name Name = "FederatedLearning" ) @@ -60,7 +64,7 @@ const ( ) // Kind contains the schema.GroupVersionKind for this controller type. -var Kind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob") +var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) // Controller ensures that all FLJob objects have corresponding pods to // run their configured workload. 
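updateDatasetFromEdge above has the handler signature that patch 2 standardizes for upstream updates; each feature controller registers such a handler via controllerContext.UpstreamController.Add(KindName, ...), as the hunks below show. The upstream.go rewrite itself is truncated here, so the following dispatcher is only a plausible sketch; beyond Add and the handler signature, the names are assumptions:

```go
// Hypothetical sketch of the upstream dispatch table (initialization elided).
type UpstreamHandler func(name, namespace, operation string, content []byte) error

type UpstreamController struct {
	updateHandlers map[string]UpstreamHandler
}

// Add registers the update handler for one CR kind, e.g. "Dataset".
func (uc *UpstreamController) Add(kind string, handler UpstreamHandler) {
	uc.updateHandlers[kind] = handler
}

// dispatch routes a status message reported by the edge to the handler
// registered for its kind.
func (uc *UpstreamController) dispatch(kind, name, namespace, operation string, content []byte) error {
	handler, ok := uc.updateHandlers[kind]
	if !ok {
		return fmt.Errorf("no upstream handler for kind %s", kind)
	}
	return handler(name, namespace, operation, content)
}
```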
@@ -531,9 +535,102 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, return } +func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error { + var err error + job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{}) + if err != nil { + // federated crd not found + return err + } + modelName := job.Spec.AggregationWorker.Model.Name + client := c.client.Models(namespace) + + return runtime.RetryUpdateStatus(modelName, namespace, (func() error { + model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{}) + if err != nil { + return err + } + + now := metav1.Now() + model.Status.UpdateTime = &now + model.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) + return err + })) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { + client := c.client.FederatedLearningJobs(namespace) + + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge updates the federated job's status +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) { + // JobInfo defines the job information + type JobInfo struct { + // Current training round + CurrentRound int `json:"currentRound"` + UpdateTime string `json:"updateTime"` + } + + // Output defines job output information + type Output struct { + Models []runtime.Model `json:"models"` + JobInfo *JobInfo `json:"ownerInfo"` + } + + var status struct { + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err = json.Unmarshal(content, &status) + if err != nil { + return + } + + output := status.Output + + if output != nil { + // Update the model's metrics + if len(output.Models) > 0 { + // only one model + model := output.Models[0] + metrics := runtime.ConvertMapToMetrics(model.Metrics) + if len(metrics) > 0 { + c.updateModelMetrics(name, namespace, metrics) + } + } + + jobInfo := output.JobInfo + // update job info if having any info + if jobInfo != nil && jobInfo.CurrentRound > 0 { + // Find a good place to save the progress info + // TODO: more meaningful reason/message + reason := "DoTraining" + message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) + cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) + c.appendStatusCondition(name, namespace, cond) + } + } + + return nil +} + // New creates a new federated learning job controller that keeps the relevant pods // in sync with their corresponding FederatedLearningJob objects. 
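The federated handler above converts the reported metrics map with runtime.ConvertMapToMetrics, another helper this patch adds to runtime/common.go (that hunk is not included here). Given the call site, a likely shape is the following, where the sednav1.Metric field names are assumptions:

```go
// Likely sketch of runtime.ConvertMapToMetrics: turn the edge-reported
// map into the []sednav1.Metric stored in model status.
func ConvertMapToMetrics(metrics map[string]string) []sednav1.Metric {
	var l []sednav1.Metric
	for k, v := range metrics {
		l = append(l, sednav1.Metric{Key: k, Value: v})
	}
	return l
}
```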
-func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { +func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := controllerContext.Config namespace := cfg.Namespace if namespace == "" { namespace = metav1.NamespaceAll @@ -585,5 +682,8 @@ func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { stopCh := make(chan struct{}) kubeInformerFactory.Start(stopCh) jobInformerFactory.Start(stopCh) + + controllerContext.UpstreamController.Add(KindName, fc.updateFromEdge) + return fc, err } diff --git a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index b422a875..338ed792 100644 --- a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -53,11 +53,15 @@ import ( ) const ( - Name = "IncrementalLearningJob" + // KindName is the kind name of CR this controller controls + KindName = "IncrementalLearningJob" + + // Name is this controller name + Name = "IncrementalLearning" ) // Kind contains the schema.GroupVersionKind for this controller type. -var Kind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob") +var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) // Controller ensures that all IncrementalLearningJob objects have corresponding pods to // run their configured workload. @@ -815,9 +819,83 @@ func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error { return err } +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { + client := c.client.IncrementalLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition data. 
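
Instead of the upstream controller hard-coding a handler table, each feature controller now registers its updateFromEdge callback under its KindName, as the UpstreamController.Add call above shows. A condensed, self-contained sketch of that dispatch mechanism, with simplified stand-in types (UpdateHandler here is only an analogue of runtime.UpstreamUpdateHandler):

package main

import (
	"fmt"
	"strings"
)

// UpdateHandler is a simplified analogue of runtime.UpstreamUpdateHandler.
type UpdateHandler func(name, namespace, operation string, content []byte) error

// Dispatcher is a simplified stand-in for the UpstreamController.
type Dispatcher struct {
	handlers map[string]UpdateHandler
}

// Add registers a handler under the lowercased kind and rejects
// duplicates, matching the Add method this series gives upstream.go.
func (d *Dispatcher) Add(kind string, h UpdateHandler) error {
	kind = strings.ToLower(kind)
	if _, ok := d.handlers[kind]; ok {
		return fmt.Errorf("an upstream handler for kind %s already exists", kind)
	}
	d.handlers[kind] = h
	return nil
}

// Dispatch routes one edge update to the handler registered for its kind.
func (d *Dispatcher) Dispatch(kind, name, namespace, operation string, content []byte) error {
	h, ok := d.handlers[strings.ToLower(kind)]
	if !ok {
		return fmt.Errorf("no handler for kind %s", kind)
	}
	return h(name, namespace, operation, content)
}

func main() {
	d := &Dispatcher{handlers: map[string]UpdateHandler{}}
	_ = d.Add("FederatedLearningJob", func(name, ns, op string, content []byte) error {
		fmt.Printf("update %s/%s op=%s payload=%s\n", ns, name, op, content)
		return nil
	})
	// Lookups normalize the kind the same way registration does.
	_ = d.Dispatch("federatedlearningjob", "job1", "default", "status", []byte(`{"phase":"training"}`))
}
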
+	// Here unmarshal and marshal immediately to skip the unnecessary fields
+	var condData runtime.IncrementalCondData
+	err = json.Unmarshal(content, &condData)
+	if err != nil {
+		return err
+	}
+	condDataBytes, _ := json.Marshal(&condData)
+
+	cond := sednav1.ILJobCondition{
+		Status:             v1.ConditionTrue,
+		LastHeartbeatTime:  metav1.Now(),
+		LastTransitionTime: metav1.Now(),
+		Data:               string(condDataBytes),
+		Message:            "reported by lc",
+	}
+
+	switch strings.ToLower(jobStatus.Phase) {
+	case "train":
+		cond.Stage = sednav1.ILJobTrain
+	case "eval":
+		cond.Stage = sednav1.ILJobEval
+	case "deploy":
+		cond.Stage = sednav1.ILJobDeploy
+	default:
+		return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
+	}
+
+	switch strings.ToLower(jobStatus.Status) {
+	case "ready":
+		cond.Type = sednav1.ILJobStageCondReady
+	case "completed":
+		cond.Type = sednav1.ILJobStageCondCompleted
+	case "failed":
+		cond.Type = sednav1.ILJobStageCondFailed
+	case "waiting":
+		cond.Type = sednav1.ILJobStageCondWaiting
+	default:
+		return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
+	}
+
+	err = c.appendStatusCondition(name, namespace, cond)
+	if err != nil {
+		return fmt.Errorf("failed to append condition, err:%+w", err)
+	}
+	return nil
+}
+
 // New creates a new IncrementalJob controller that keeps the relevant pods
 // in sync with their corresponding IncrementalJob objects.
-func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) {
+func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	cfg := controllerContext.Config
 	namespace := cfg.Namespace
 	if namespace == "" {
 		namespace = metav1.NamespaceAll
@@ -880,5 +958,8 @@ func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) {
 	stopCh := make(chan struct{})
 	kubeInformerFactory.Start(stopCh)
 	jobInformerFactory.Start(stopCh)
+
+	controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge)
+
 	return jc, err
 }
diff --git a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
index d32ee83c..13c221af 100644
--- a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
+++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
@@ -52,7 +52,11 @@ import (
 )
 
 const (
+	// Name is this controller name
 	Name = "JointInference"
+
+	// KindName is the kind name of CR this controller controls
+	KindName = "JointInferenceService"
 )
 
 const (
@@ -62,7 +66,7 @@ const (
 )
 
 // Kind contains the schema.GroupVersionKind for this controller type.
-var Kind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService")
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)
 
 // Controller ensures that all JointInferenceService objects
 // have corresponding pods to run their configured workload.
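
The updateFromEdge flow above hinges on normalizing the free-form phase/status strings reported by the LC before mapping them to typed condition fields. A standalone illustration of that decoding step; the payload shape follows the anonymous struct above, and the sample content is invented:

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

func main() {
	// An invented example of what the LC might report for an
	// incremental learning job once its eval stage finishes.
	content := []byte(`{"phase": "Eval", "status": "Completed"}`)

	var jobStatus struct {
		Phase  string `json:"phase"`
		Status string `json:"status"`
	}
	if err := json.Unmarshal(content, &jobStatus); err != nil {
		panic(err)
	}

	// Normalization mirrors the switch statements in updateFromEdge:
	// unknown values fall through to an error instead of a condition.
	switch strings.ToLower(jobStatus.Phase) {
	case "train", "eval", "deploy":
		fmt.Println("stage:", strings.ToLower(jobStatus.Phase))
	default:
		fmt.Println("invalid condition stage:", jobStatus.Phase)
	}
	switch strings.ToLower(jobStatus.Status) {
	case "ready", "completed", "failed", "waiting":
		fmt.Println("type:", strings.ToLower(jobStatus.Status))
	default:
		fmt.Println("invalid condition type:", jobStatus.Status)
	}
}
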
@@ -537,9 +541,70 @@ func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bi return err } +func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error { + client := c.client.JointInferenceServices(namespace) + + return runtime.RetryUpdateStatus(name, namespace, func() error { + joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + joint.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + // Output defines owner output information + type Output struct { + ServiceInfo map[string]interface{} `json:"ownerInfo"` + } + + var status struct { + // Phase always should be "inference" + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + // TODO: propagate status.Status to k8s + + output := status.Output + if output == nil || output.ServiceInfo == nil { + // no output info + klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) + return nil + } + + info := output.ServiceInfo + + for _, ignoreTimeKey := range []string{ + "startTime", + "updateTime", + } { + delete(info, ignoreTimeKey) + } + + metrics := runtime.ConvertMapToMetrics(info) + + err = c.updateMetrics(name, namespace, metrics) + if err != nil { + return fmt.Errorf("failed to update metrics, err:%+w", err) + } + return nil +} + // New creates a new JointInferenceService controller that keeps the relevant pods // in sync with their corresponding JointInferenceService objects. -func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { +func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := controllerContext.Config var err error namespace := cfg.Namespace if namespace == "" { @@ -597,5 +662,8 @@ func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { stopCh := messageContext.Done() kubeInformerFactory.Start(stopCh) serviceInformerFactory.Start(stopCh) + + controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge) + return jc, err } diff --git a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go index 2d95dcc5..f7d7e197 100644 --- a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go +++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go @@ -18,6 +18,7 @@ package lifelonglearning import ( "context" + "encoding/json" "fmt" "strings" "time" @@ -51,11 +52,14 @@ import ( ) const ( + // KindName is the kind name of CR this controller controls + KindName = "LifelongLearningJob" + // Name is this controller name Name = "LifelongLearning" ) // Kind contains the schema.GroupVersionKind for this controller type. -var Kind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob") +var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) // Controller ensures that all LifelongLearningJob objects have corresponding pods to // run their configured workload. 
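
ConvertMapToMetrics, which the joint inference handler relies on (added to runtime/common.go later in this same patch), stringifies heterogeneous metric values: strings pass through untouched, anything else is JSON-encoded. A self-contained sketch of that behavior with an invented serviceInfo payload; Metric here is a simplified stand-in for sednav1.Metric:

package main

import (
	"encoding/json"
	"fmt"
)

// Metric mirrors sednav1.Metric: a flat key/value pair of strings.
type Metric struct {
	Key   string
	Value string
}

// convertMapToMetrics reproduces the helper's logic: keep strings as-is,
// JSON-encode any other value type.
func convertMapToMetrics(m map[string]interface{}) []Metric {
	var l []Metric
	for k, v := range m {
		var displayValue string
		switch t := v.(type) {
		case string:
			displayValue = t
		default:
			b, _ := json.Marshal(v) // marshal error deliberately ignored
			displayValue = string(b)
		}
		l = append(l, Metric{Key: k, Value: displayValue})
	}
	return l
}

func main() {
	// Invented serviceInfo after the start/update time keys are dropped.
	info := map[string]interface{}{
		"inferenceNumber":   573,
		"hardExampleNumber": 32,
		"status":            "running",
	}
	for _, m := range convertMapToMetrics(info) {
		fmt.Printf("%s=%s\n", m.Key, m.Value)
	}
}
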
@@ -708,9 +712,84 @@ func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { return err } +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { + client := c.client.LifelongLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, namespace, func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition data. + // Here unmarshal and marshal immediately to skip the unnecessary fields + var condData runtime.LifelongLearningCondData + err = json.Unmarshal(content, &condData) + if err != nil { + return err + } + + condDataBytes, _ := json.Marshal(&condData) + + cond := sednav1.LLJobCondition{ + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Data: string(condDataBytes), + Message: "reported by lc", + } + + switch strings.ToLower(jobStatus.Phase) { + case "train": + cond.Stage = sednav1.LLJobTrain + case "eval": + cond.Stage = sednav1.LLJobEval + case "deploy": + cond.Stage = sednav1.LLJobDeploy + default: + return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) + } + + switch strings.ToLower(jobStatus.Status) { + case "ready": + cond.Type = sednav1.LLJobStageCondReady + case "completed": + cond.Type = sednav1.LLJobStageCondCompleted + case "failed": + cond.Type = sednav1.LLJobStageCondFailed + case "waiting": + cond.Type = sednav1.LLJobStageCondWaiting + default: + return fmt.Errorf("invalid condition type: %v", jobStatus.Status) + } + + err = c.appendStatusCondition(name, namespace, cond) + if err != nil { + return fmt.Errorf("failed to append condition, err:%+w", err) + } + return nil +} + // New creates a new LifelongLearningJob controller that keeps the relevant pods // in sync with their corresponding LifelongLearningJob objects. 
-func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { +func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := controllerContext.Config namespace := cfg.Namespace if namespace == "" { namespace = metav1.NamespaceAll @@ -772,5 +851,8 @@ func New(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { stopCh := make(chan struct{}) kubeInformerFactory.Start(stopCh) jobInformerFactory.Start(stopCh) + + controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge) + return jc, err } diff --git a/pkg/globalmanager/controllers/manager.go b/pkg/globalmanager/controllers/manager.go index dfb9149c..563e4489 100644 --- a/pkg/globalmanager/controllers/manager.go +++ b/pkg/globalmanager/controllers/manager.go @@ -23,6 +23,7 @@ import ( "github.com/kubeedge/sedna/pkg/globalmanager/config" websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) // Manager defines the controller manager @@ -44,9 +45,13 @@ func (m *Manager) Start() error { dc, _ := NewDownstreamController(m.Config) uc.Start() dc.Start() + context := &runtime.ControllerContext{ + UpstreamController: uc, + Config: m.Config, + } for name, factory := range NewRegistry() { - f, err := factory(m.Config) + f, err := factory(context) if err != nil { return fmt.Errorf("failed to initialize controller %s: %v", name, err) } diff --git a/pkg/globalmanager/controllers/registry.go b/pkg/globalmanager/controllers/registry.go index af419fc1..dde760aa 100644 --- a/pkg/globalmanager/controllers/registry.go +++ b/pkg/globalmanager/controllers/registry.go @@ -3,7 +3,6 @@ package controllers import ( "fmt" - "github.com/kubeedge/sedna/pkg/globalmanager/config" fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning" il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference" @@ -11,7 +10,7 @@ import ( "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) -type FeatureFactory = func(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) +type FeatureFactory = func(*runtime.ControllerContext) (runtime.FeatureControllerI, error) type Registry map[string]FeatureFactory diff --git a/pkg/globalmanager/controllers/upstream.go b/pkg/globalmanager/controllers/upstream.go index 66b9172a..8688594e 100644 --- a/pkg/globalmanager/controllers/upstream.go +++ b/pkg/globalmanager/controllers/upstream.go @@ -17,55 +17,22 @@ limitations under the License. 
 package controllers
 
 import (
-	"context"
-	"encoding/json"
 	"fmt"
 	"strings"
 
-	v1 "k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/klog/v2"
-
-	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning"
 	"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer"
 	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
-	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
+	"k8s.io/klog/v2"
 )
 
-// updateHandler handles the updates from LC(running at edge) to update the
-// corresponding resource
-type updateHandler func(namespace, name, operation string, content []byte) error
-
 // UpstreamController subscribes the updates from edge and syncs to k8s api server
 type UpstreamController struct {
-	client         *clientset.SednaV1alpha1Client
 	messageLayer   messagelayer.MessageLayer
-	updateHandlers map[string]updateHandler
+	updateHandlers map[string]runtime.UpstreamUpdateHandler
 }
 
-const upstreamStatusUpdateRetries = 3
-
-// retryUpdateStatus simply retries to call the status update func
-func retryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
-	var err error
-	for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ {
-		err = updateStatusFunc()
-		if err == nil {
-			return nil
-		}
-		klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err)
-	}
-	return err
-}
-
-func newUnmarshalError(namespace, name, operation string, content []byte) error {
-	return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content))
-}
-
-func checkUpstreamOperation(operation string) error {
+func (uc *UpstreamController) checkOperation(operation string) error {
 	// current only support the 'status' operation
 	if operation != "status" {
 		return fmt.Errorf("unknown operation %s", operation)
@@ -73,383 +40,6 @@ func checkUpstreamOperation(operation string) error {
 	return nil
 }
 
-// updateDatasetStatus updates the dataset status
-func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status sednav1.DatasetStatus) error {
-	client := uc.client.Datasets(namespace)
-
-	if status.UpdateTime == nil {
-		now := metav1.Now()
-		status.UpdateTime = &now
-	}
-
-	return retryUpdateStatus(name, namespace, func() error {
-		dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{})
-		if err != nil {
-			return err
-		}
-		dataset.Status = status
-		_, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{})
-		return err
-	})
-}
-
-// updateDatasetFromEdge syncs update from edge
-func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error {
-	err := checkUpstreamOperation(operation)
-	if err != nil {
-		return err
-	}
-
-	status := sednav1.DatasetStatus{}
-	err = json.Unmarshal(content, &status)
-	if err != nil {
-		return newUnmarshalError(namespace, name, operation, content)
-	}
-
-	return uc.updateDatasetStatus(name, namespace, status)
-}
-
-// convertToMetrics converts the metrics from LCs to resource metrics
-func convertToMetrics(m map[string]interface{}) []sednav1.Metric {
-	var l []sednav1.Metric
-	for k, v := range m {
-		var displayValue string
-		switch t := v.(type) {
-		case string:
-			displayValue = t
-		default:
-			// ignore the json marshal error
-			b, _ :=
json.Marshal(v) - displayValue = string(b) - } - - l = append(l, sednav1.Metric{Key: k, Value: displayValue}) - } - return l -} - -func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.JointInferenceServices(namespace) - - return retryUpdateStatus(name, namespace, func() error { - joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - joint.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) - return err - }) -} - -// updateJointInferenceFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - - // Output defines owner output information - type Output struct { - ServiceInfo map[string]interface{} `json:"ownerInfo"` - } - - var status struct { - // Phase always should be "inference" - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err = json.Unmarshal(content, &status) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // TODO: propagate status.Status to k8s - - output := status.Output - if output == nil || output.ServiceInfo == nil { - // no output info - klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) - return nil - } - - info := output.ServiceInfo - - for _, ignoreTimeKey := range []string{ - "startTime", - "updateTime", - } { - delete(info, ignoreTimeKey) - } - - metrics := convertToMetrics(info) - - err = uc.updateJointInferenceMetrics(name, namespace, metrics) - if err != nil { - return fmt.Errorf("failed to update metrics, err:%+w", err) - } - return nil -} - -func (uc *UpstreamController) updateModelMetrics(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.Models(namespace) - - return retryUpdateStatus(name, namespace, (func() error { - model, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - - now := metav1.Now() - model.Status.UpdateTime = &now - model.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) - return err - })) -} - -func (uc *UpstreamController) updateModelMetricsByFederatedName(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.FederatedLearningJobs(namespace) - var err error - federatedLearningJob, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - // federated crd not found - return err - } - modelName := federatedLearningJob.Spec.AggregationWorker.Model.Name - return uc.updateModelMetrics(modelName, namespace, metrics) -} - -func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { - client := uc.client.FederatedLearningJobs(namespace) - - return retryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateFederatedLearningJobFromEdge updates the federated job's status -func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation 
string, content []byte) (err error) { - err = checkUpstreamOperation(operation) - if err != nil { - return err - } - - // JobInfo defines the job information - type JobInfo struct { - // Current training round - CurrentRound int `json:"currentRound"` - UpdateTime string `json:"updateTime"` - } - - // Output defines job output information - type Output struct { - Models []runtime.Model `json:"models"` - JobInfo *JobInfo `json:"ownerInfo"` - } - - var status struct { - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err = json.Unmarshal(content, &status) - if err != nil { - err = newUnmarshalError(namespace, name, operation, content) - return - } - - output := status.Output - - if output != nil { - // Update the model's metrics - if len(output.Models) > 0 { - // only one model - model := output.Models[0] - metrics := convertToMetrics(model.Metrics) - if len(metrics) > 0 { - uc.updateModelMetricsByFederatedName(name, namespace, metrics) - } - } - - jobInfo := output.JobInfo - // update job info if having any info - if jobInfo != nil && jobInfo.CurrentRound > 0 { - // Find a good place to save the progress info - // TODO: more meaningful reason/message - reason := "DoTraining" - message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) - cond := fl.NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) - uc.appendFederatedLearningJobStatusCondition(name, namespace, cond) - } - } - - return nil -} - -func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { - client := uc.client.IncrementalLearningJobs(namespace) - return retryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateIncrementalLearningFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err = json.Unmarshal(content, &jobStatus) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // Get the condition data. 
- // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData runtime.IncrementalCondData - err = json.Unmarshal(content, &condData) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - condDataBytes, _ := json.Marshal(&condData) - - cond := sednav1.ILJobCondition{ - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Data: string(condDataBytes), - Message: "reported by lc", - } - - switch strings.ToLower(jobStatus.Phase) { - case "train": - cond.Stage = sednav1.ILJobTrain - case "eval": - cond.Stage = sednav1.ILJobEval - case "deploy": - cond.Stage = sednav1.ILJobDeploy - default: - return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) - } - - switch strings.ToLower(jobStatus.Status) { - case "ready": - cond.Type = sednav1.ILJobStageCondReady - case "completed": - cond.Type = sednav1.ILJobStageCondCompleted - case "failed": - cond.Type = sednav1.ILJobStageCondFailed - case "waiting": - cond.Type = sednav1.ILJobStageCondWaiting - default: - return fmt.Errorf("invalid condition type: %v", jobStatus.Status) - } - - err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond) - if err != nil { - return fmt.Errorf("failed to append condition, err:%+w", err) - } - return nil -} - -func (uc *UpstreamController) appendLifelongLearningJobStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { - client := uc.client.LifelongLearningJobs(namespace) - return retryUpdateStatus(name, namespace, func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - }) -} - -// updateLifelongLearningJobFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateLifelongLearningJobFromEdge(name, namespace, operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err = json.Unmarshal(content, &jobStatus) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // Get the condition data. 
-	// Here unmarshal and marshal immediately to skip the unnecessary fields
-	var condData runtime.LifelongLearningCondData
-	err = json.Unmarshal(content, &condData)
-	if err != nil {
-		return newUnmarshalError(namespace, name, operation, content)
-	}
-	condDataBytes, _ := json.Marshal(&condData)
-
-	cond := sednav1.LLJobCondition{
-		Status:             v1.ConditionTrue,
-		LastHeartbeatTime:  metav1.Now(),
-		LastTransitionTime: metav1.Now(),
-		Data:               string(condDataBytes),
-		Message:            "reported by lc",
-	}
-
-	switch strings.ToLower(jobStatus.Phase) {
-	case "train":
-		cond.Stage = sednav1.LLJobTrain
-	case "eval":
-		cond.Stage = sednav1.LLJobEval
-	case "deploy":
-		cond.Stage = sednav1.LLJobDeploy
-	default:
-		return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
-	}
-
-	switch strings.ToLower(jobStatus.Status) {
-	case "ready":
-		cond.Type = sednav1.LLJobStageCondReady
-	case "completed":
-		cond.Type = sednav1.LLJobStageCondCompleted
-	case "failed":
-		cond.Type = sednav1.LLJobStageCondFailed
-	case "waiting":
-		cond.Type = sednav1.LLJobStageCondWaiting
-	default:
-		return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
-	}
-
-	err = uc.appendLifelongLearningJobStatusCondition(name, namespace, cond)
-	if err != nil {
-		return fmt.Errorf("failed to append condition, err:%+w", err)
-	}
-	return nil
-}
-
 // syncEdgeUpdate receives the updates from edge and syncs these to k8s.
 func (uc *UpstreamController) syncEdgeUpdate() {
 	for {
@@ -461,10 +51,14 @@ func (uc *UpstreamController) syncEdgeUpdate() {
 		}
 
 		update, err := uc.messageLayer.ReceiveResourceUpdate()
-		if err != nil {
+		if err == nil {
+			err = uc.checkOperation(update.Operation)
+		}
+		if err != nil {
 			klog.Warningf("Ignore update since this err: %+v", err)
 			continue
 		}
+
 		kind := update.Kind
 		namespace := update.Namespace
 
@@ -496,25 +90,21 @@ func (uc *UpstreamController) GetName() string {
 	return "UpstreamController"
 }
 
-// NewUpstreamController creates a new Upstream controller from config
-func NewUpstreamController(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) {
-	client, err := utils.NewCRDClient()
-	if err != nil {
-		return nil, fmt.Errorf("create crd client failed with error: %w", err)
-	}
-	uc := &UpstreamController{
-		client:       client,
-		messageLayer: messagelayer.NewContextMessageLayer(),
+func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamUpdateHandler) error {
+	kind = strings.ToLower(kind)
+	if _, ok := uc.updateHandlers[kind]; ok {
+		return fmt.Errorf("an upstream handler for kind %s already exists", kind)
 	}
+	uc.updateHandlers[kind] = handler
+
+	return nil
+}
 
-	// NOTE: current no direct model update from edge,
-	// model update will be triggered by the corresponding training feature
-	uc.updateHandlers = map[string]updateHandler{
-		"dataset":                uc.updateDatasetFromEdge,
-		"jointinferenceservice":  uc.updateJointInferenceFromEdge,
-		"federatedlearningjob":   uc.updateFederatedLearningJobFromEdge,
-		"incrementallearningjob": uc.updateIncrementalLearningFromEdge,
-		"lifelonglearningjob":    uc.updateLifelongLearningJobFromEdge,
+// NewUpstreamController creates a new Upstream controller from config
+func NewUpstreamController(cfg *config.ControllerConfig) (*UpstreamController, error) {
+	uc := &UpstreamController{
+		messageLayer:   messagelayer.NewContextMessageLayer(),
+		updateHandlers: make(map[string]runtime.UpstreamUpdateHandler),
 	}
 
 	return uc, nil
diff --git a/pkg/globalmanager/runtime/common.go b/pkg/globalmanager/runtime/common.go
index 531fa27c..47bc7e0e 100644
--- a/pkg/globalmanager/runtime/common.go
+++ b/pkg/globalmanager/runtime/common.go
@@ -18,6 +18,7 @@ package runtime
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"math"
 	"strings"
@@ -27,6 +28,9 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/util/workqueue"
+	"k8s.io/klog/v2"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
 )
 
 const (
@@ -128,3 +132,37 @@ func ConvertK8SValidName(name string) string {
 
 	return string(fixName)
 }
+
+// ConvertMapToMetrics converts the metric map to list of resource Metric
+func ConvertMapToMetrics(metric map[string]interface{}) []sednav1.Metric {
+	var l []sednav1.Metric
+	for k, v := range metric {
+		var displayValue string
+		switch t := v.(type) {
+		case string:
+			displayValue = t
+		default:
+			// ignore the json marshal error
+			b, _ := json.Marshal(v)
+			displayValue = string(b)
+		}
+
+		l = append(l, sednav1.Metric{Key: k, Value: displayValue})
+	}
+	return l
+}
+
+const upstreamStatusUpdateRetries = 3
+
+// RetryUpdateStatus simply retries to call the status update func
+func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
+	var err error
+	for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ {
+		err = updateStatusFunc()
+		if err == nil {
+			return nil
+		}
+		klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err)
+	}
+	return err
+}
diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go
index 0f5788fa..0e18812e 100644
--- a/pkg/globalmanager/runtime/types.go
+++ b/pkg/globalmanager/runtime/types.go
@@ -19,6 +19,7 @@ package runtime
 import (
 	"encoding/json"
 
+	"github.com/kubeedge/sedna/pkg/globalmanager/config"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -165,3 +166,15 @@ func (cd *LifelongLearningCondData) GetInputModelURLs() []string {
 func (cd *LifelongLearningCondData) GetOutputModelURLs() []string {
 	return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
 }
+
+// updateHandler handles the updates from LC(running at edge) to update the
+// corresponding resource
+type UpstreamUpdateHandler func(namespace, name, operation string, content []byte) error
+type UpstreamControllerI interface {
+	Add(kind string, updateHandler UpstreamUpdateHandler) error
+}
+
+type ControllerContext struct {
+	Config             *config.ControllerConfig
+	UpstreamController UpstreamControllerI
+}

From c0886dfe28a27272def17467993c0c42b15604d9 Mon Sep 17 00:00:00 2001
From: llhuii
Date: Fri, 23 Jul 2021 11:17:54 +0800
Subject: [PATCH 3/7] gm: share client/Informer with all controllers

Make all controllers share:
1. the kubernetes client, and an informer factory with a random resync
   period.
2. the sedna CRD client, and an informer factory with a random resync
   period.

This reduces duplicated code and slightly improves performance.
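
The "random resync period" mentioned here is a jitter factor in [1, 2) applied to a minimum period, so the informer caches shared by all controllers do not resync in lockstep. A minimal runnable sketch of the idea, mirroring the genResyncPeriod helper added to manager.go below:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// genResyncPeriod returns a duration in [minPeriod, 2*minPeriod),
// matching the helper this patch adds to manager.go.
func genResyncPeriod(minPeriod time.Duration) time.Duration {
	factor := rand.Float64() + 1
	return time.Duration(factor * float64(minPeriod.Nanoseconds()))
}

func main() {
	// Each factory gets its own jittered period, e.g. somewhere in 30s..60s.
	for i := 0; i < 3; i++ {
		fmt.Println(genResyncPeriod(30 * time.Second))
	}
}
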
Signed-off-by: llhuii --- pkg/globalmanager/controllers/downstream.go | 34 ++----- .../federatedlearning/federatedlearningjob.go | 71 +++++---------- .../incrementallearningjob.go | 91 ++++++------------- .../jointinference/jointinferenceservice.go | 73 +++++---------- .../lifelonglearning/lifelonglearningjob.go | 82 +++++------------ pkg/globalmanager/controllers/manager.go | 74 ++++++++++++--- pkg/globalmanager/controllers/upstream.go | 19 ++-- pkg/globalmanager/runtime/types.go | 19 +++- 8 files changed, 197 insertions(+), 266 deletions(-) diff --git a/pkg/globalmanager/controllers/downstream.go b/pkg/globalmanager/controllers/downstream.go index a6134454..34938817 100644 --- a/pkg/globalmanager/controllers/downstream.go +++ b/pkg/globalmanager/controllers/downstream.go @@ -34,7 +34,6 @@ import ( "github.com/kubeedge/sedna/pkg/globalmanager/config" "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) // DownstreamController watch kubernetes api server and send the controller resource change to edge @@ -44,7 +43,7 @@ type DownstreamController struct { cfg *config.ControllerConfig - client *clientset.SednaV1alpha1Client + client clientset.SednaV1alpha1Interface kubeClient kubernetes.Interface messageLayer messagelayer.MessageLayer @@ -344,44 +343,25 @@ func (dc *DownstreamController) watch(stopCh <-chan struct{}) { } // Start starts the controller -func (dc *DownstreamController) Start() error { - stopCh := dc.messageLayer.Done() - +func (dc *DownstreamController) Run(stopCh <-chan struct{}) { // watch is an asynchronous call dc.watch(stopCh) // sync is a synchronous call - go dc.sync(stopCh) - - return nil -} - -// GetName returns the name of the downstream controller -func (dc *DownstreamController) GetName() string { - return "DownstreamController" + dc.sync(stopCh) } // NewDownstreamController creates a controller DownstreamController from config -func NewDownstreamController(cfg *config.ControllerConfig) (runtime.FeatureControllerI, error) { +func NewDownstreamController(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { // TODO: make bufferSize configurable bufferSize := 10 events := make(chan watch.Event, bufferSize) - crdclient, err := utils.NewCRDClient() - if err != nil { - return nil, fmt.Errorf("create crd client failed with error: %w", err) - } - - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } - dc := &DownstreamController{ - cfg: cfg, + cfg: cc.Config, events: events, - client: crdclient, - kubeClient: kubeClient, + client: cc.SednaClient.SednaV1alpha1(), + kubeClient: cc.KubeClient, messageLayer: messagelayer.NewContextMessageLayer(), } diff --git a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index 47448742..fb54a36b 100644 --- a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -29,7 +29,6 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -41,14 +40,10 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 
"github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) const ( @@ -93,31 +88,28 @@ type Controller struct { cfg *config.ControllerConfig } -// Run the main goroutine responsible for watching and syncing jobs. -func (c *Controller) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. +func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer c.queue.ShutDown() - klog.Infof("Starting federatedlearning job controller") - defer klog.Infof("Shutting down federatedlearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, c.podStoreSynced, c.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - klog.Infof("Starting federatedlearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(c.worker, time.Second, stopCh) - } + return + } - <-stopCh - }() - return nil + klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the FederatedLearningJob object of the specified pod. @@ -629,28 +621,19 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ // New creates a new federated learning job controller that keeps the relevant pods // in sync with their corresponding FederatedLearningJob objects. 
-func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { - cfg := controllerContext.Config - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - kubecfg, _ := utils.KubeConfig() - crdclient, err := clientset.NewForConfig(kubecfg) - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - podInformer := kubeInformerFactory.Core().V1().Pods() + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().FederatedLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().FederatedLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) fc := &Controller{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "flJob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}), @@ -679,11 +662,7 @@ func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControlle fc.podStore = podInformer.Lister() fc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - - controllerContext.UpstreamController.Add(KindName, fc.updateFromEdge) + cc.UpstreamController.Add(KindName, fc.updateFromEdge) - return fc, err + return fc, nil } diff --git a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index 338ed792..d995eac6 100644 --- a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -30,7 +30,6 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -42,22 +41,18 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) const ( - // KindName is the kind name of CR this 
controller controls - KindName = "IncrementalLearningJob" - // Name is this controller name Name = "IncrementalLearning" + + // KindName is the kind name of CR this controller controls + KindName = "IncrementalLearningJob" ) // Kind contains the schema.GroupVersionKind for this controller type. @@ -90,30 +85,28 @@ type Controller struct { cfg *config.ControllerConfig } -// Run the main goroutine responsible for watching and syncing jobs. -func (c *Controller) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. +func (c *Controller) Run(stopCh <-chan struct{}) { + // TODO: make workers parameter workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer c.queue.ShutDown() - klog.Infof("Starting incrementallearning job controller") - defer klog.Infof("Shutting down incrementallearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, c.podStoreSynced, c.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } - klog.Infof("Starting incrementallearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(c.worker, time.Second, stopCh) - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - <-stopCh - }() - return nil + return + } + klog.Infof("Starting %s job workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the jointInferenceService object of the specified pod. @@ -894,43 +887,21 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ // New creates a new IncrementalJob controller that keeps the relevant pods // in sync with their corresponding IncrementalJob objects. 
-func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { - cfg := controllerContext.Config - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - kubecfg, err := utils.KubeConfig() - if err != nil { - return nil, err - } - crdclient, err := clientset.NewForConfig(kubecfg) - if err != nil { - return nil, err - } - - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) - - podInformer := kubeInformerFactory.Core().V1().Pods() - - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) jc := &Controller{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "incrementallearningjob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}), - cfg: cfg, + cfg: cc.Config, } jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ @@ -955,11 +926,7 @@ func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControlle jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - - controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge) + cc.UpstreamController.Add(KindName, jc.updateFromEdge) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go index 13c221af..50cbf408 100644 --- a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go +++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go @@ -29,7 +29,6 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -41,14 +40,10 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - 
messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) const ( @@ -92,31 +87,28 @@ type Controller struct { cfg *config.ControllerConfig } -// Start starts the main goroutine responsible for watching and syncing services. -func (c *Controller) Start() error { +// Run starts the main goroutine responsible for watching and syncing services. +func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer c.queue.ShutDown() - klog.Infof("Starting joint inference service controller") - defer klog.Infof("Shutting down joint inference service controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, c.podStoreSynced, c.serviceStoreSynced) { - klog.Errorf("failed to wait for joint inferce service caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.serviceStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - klog.Infof("Starting joint inference service workers") - for i := 0; i < workers; i++ { - go wait.Until(c.worker, time.Second, stopCh) - } + return + } - <-stopCh - }() - return nil + klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the jointInferenceService object of the specified pod. @@ -603,30 +595,19 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ // New creates a new JointInferenceService controller that keeps the relevant pods // in sync with their corresponding JointInferenceService objects. 
-func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { - cfg := controllerContext.Config - var err error - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - - kubeClient, _ := utils.KubeClient() - kubecfg, _ := utils.KubeConfig() - crdclient, _ := clientset.NewForConfig(kubecfg) - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - podInformer := kubeInformerFactory.Core().V1().Pods() + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - serviceInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - serviceInformer := serviceInformerFactory.Sedna().V1alpha1().JointInferenceServices() + serviceInformer := cc.SednaInformerFactory.Sedna().V1alpha1().JointInferenceServices() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) jc := &Controller{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "jointinferenceservice"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "jointinferenceservice-controller"}), @@ -659,11 +640,7 @@ func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControlle jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := messageContext.Done() - kubeInformerFactory.Start(stopCh) - serviceInformerFactory.Start(stopCh) - - controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge) + cc.UpstreamController.Add(KindName, jc.updateFromEdge) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go index f7d7e197..1aa80cdd 100644 --- a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go +++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go @@ -29,7 +29,6 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -41,14 +40,10 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) const 
( @@ -88,30 +83,27 @@ type Controller struct { cfg *config.ControllerConfig } -// Run the main goroutine responsible for watching and syncing jobs. -func (c *Controller) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. +func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer c.queue.ShutDown() - klog.Infof("Starting lifelonglearning job controller") - defer klog.Infof("Shutting down lifelonglearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, c.podStoreSynced, c.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } - klog.Infof("Starting lifelonglearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(c.worker, time.Second, stopCh) - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - <-stopCh - }() - return nil + return + } + klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the lifelonglearningjob object of the specified pod. @@ -788,39 +780,19 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ // New creates a new LifelongLearningJob controller that keeps the relevant pods // in sync with their corresponding LifelongLearningJob objects. -func New(controllerContext *runtime.ControllerContext) (runtime.FeatureControllerI, error) { - cfg := controllerContext.Config - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - kubecfg, err := utils.KubeConfig() - if err != nil { - return nil, err - } - crdclient, err := clientset.NewForConfig(kubecfg) - if err != nil { - return nil, err - } - - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - podInformer := kubeInformerFactory.Core().V1().Pods() - - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) jc := &Controller{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "lifelonglearningjob"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}), cfg: cfg, @@ -848,11 +820,7 @@ func New(controllerContext 
*runtime.ControllerContext) (runtime.FeatureControlle jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - - controllerContext.UpstreamController.Add(KindName, jc.updateFromEdge) + cc.UpstreamController.Add(KindName, jc.updateFromEdge) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/manager.go b/pkg/globalmanager/controllers/manager.go index 563e4489..1066597b 100644 --- a/pkg/globalmanager/controllers/manager.go +++ b/pkg/globalmanager/controllers/manager.go @@ -18,12 +18,19 @@ package controllers import ( "fmt" + "math/rand" + "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeinformers "k8s.io/client-go/informers" "k8s.io/klog/v2" + clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" + sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" "github.com/kubeedge/sedna/pkg/globalmanager/config" websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + "github.com/kubeedge/sedna/pkg/globalmanager/utils" ) // Manager defines the controller manager @@ -39,34 +46,79 @@ func New(cc *config.ControllerConfig) *Manager { } } +func genResyncPeriod(minPeriod time.Duration) time.Duration { + factor := rand.Float64() + 1 + // [minPeriod, 2*minPeriod) + return time.Duration(factor * float64(minPeriod.Nanoseconds())) +} + // Start starts the controllers it has managed func (m *Manager) Start() error { - uc, _ := NewUpstreamController(m.Config) - dc, _ := NewDownstreamController(m.Config) - uc.Start() - dc.Start() + kubeClient, err := utils.KubeClient() + if err != nil { + return err + } + + kubecfg, err := utils.KubeConfig() + if err != nil { + return err + } + + sednaClient, err := clientset.NewForConfig(kubecfg) + if err != nil { + return err + } + + cfg := m.Config + namespace := cfg.Namespace + if namespace == "" { + namespace = metav1.NamespaceAll + } + + // make this period configurable + minResyncPeriod := time.Second * 30 + + kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, genResyncPeriod(minResyncPeriod), kubeinformers.WithNamespace(namespace)) + + sednaInformerFactory := sednainformers.NewSharedInformerFactoryWithOptions(sednaClient, genResyncPeriod(minResyncPeriod), sednainformers.WithNamespace(namespace)) + context := &runtime.ControllerContext{ - UpstreamController: uc, - Config: m.Config, + Config: m.Config, + + KubeClient: kubeClient, + KubeInformerFactory: kubeInformerFactory, + + SednaClient: sednaClient, + SednaInformerFactory: sednaInformerFactory, } + uc, _ := NewUpstreamController(context) + dc, _ := NewDownstreamController(context) + context.UpstreamController = uc + + stopCh := make(chan struct{}) + + kubeInformerFactory.Start(stopCh) + sednaInformerFactory.Start(stopCh) + + go uc.Run(stopCh) + go dc.Run(stopCh) + for name, factory := range NewRegistry() { f, err := factory(context) if err != nil { return fmt.Errorf("failed to initialize controller %s: %v", name, err) } - err = f.Start() - if err != nil { - return fmt.Errorf("failed to start controller %s: %v", name, err) - } + go f.Run(stopCh) klog.Infof("started controller %s", name) } addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port) ws := websocket.NewServer(addr) - err := ws.ListenAndServe() + err = ws.ListenAndServe() if err != nil { + close(stopCh) return 
fmt.Errorf("failed to listen websocket at %s: %v", addr, err) } return nil diff --git a/pkg/globalmanager/controllers/upstream.go b/pkg/globalmanager/controllers/upstream.go index 8688594e..785f7c21 100644 --- a/pkg/globalmanager/controllers/upstream.go +++ b/pkg/globalmanager/controllers/upstream.go @@ -20,10 +20,10 @@ import ( "fmt" "strings" - "github.com/kubeedge/sedna/pkg/globalmanager/config" + "k8s.io/klog/v2" + "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - "k8s.io/klog/v2" ) // UpstreamController subscribes the updates from edge and syncs to k8s api server @@ -77,17 +77,12 @@ func (uc *UpstreamController) syncEdgeUpdate() { } } -// Start the upstream controller -func (uc *UpstreamController) Start() error { +// Run starts the upstream controller +func (uc *UpstreamController) Run(stopCh <-chan struct{}) { klog.Info("Start the sedna upstream controller") - go uc.syncEdgeUpdate() - return nil -} - -// GetName returns the name of the upstream controller -func (uc *UpstreamController) GetName() string { - return "UpstreamController" + uc.syncEdgeUpdate() + <-stopCh } func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamUpdateHandler) error { @@ -101,7 +96,7 @@ func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamUpdateHan } // NewUpstreamController creates a new Upstream controller from config -func NewUpstreamController(cfg *config.ControllerConfig) (*UpstreamController, error) { +func NewUpstreamController(cc *runtime.ControllerContext) (runtime.UpstreamControllerI, error) { uc := &UpstreamController{ messageLayer: messagelayer.NewContextMessageLayer(), updateHandlers: make(map[string]runtime.UpstreamUpdateHandler), diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go index 0e18812e..ecd4144e 100644 --- a/pkg/globalmanager/runtime/types.go +++ b/pkg/globalmanager/runtime/types.go @@ -21,20 +21,25 @@ import ( "github.com/kubeedge/sedna/pkg/globalmanager/config" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" + k8sruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + kubeinformers "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + + sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" + sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" ) // CommonInterface describes the commom interface of CRs type CommonInterface interface { metav1.Object schema.ObjectKind - runtime.Object + k8sruntime.Object } // FeatureControllerI defines the interface of an AI Feature controller type FeatureControllerI interface { - Start() error + Run(stopCh <-chan struct{}) } type Model struct { @@ -170,11 +175,19 @@ func (cd *LifelongLearningCondData) GetOutputModelURLs() []string { // updateHandler handles the updates from LC(running at edge) to update the // corresponding resource type UpstreamUpdateHandler func(namespace, name, operation string, content []byte) error + type UpstreamControllerI interface { + FeatureControllerI Add(kind string, updateHandler UpstreamUpdateHandler) error } type ControllerContext struct { Config *config.ControllerConfig UpstreamController UpstreamControllerI + + KubeClient kubernetes.Interface + KubeInformerFactory kubeinformers.SharedInformerFactory + + SednaClient sednaclientset.Interface + SednaInformerFactory sednainformers.SharedInformerFactory } From 704d9477aa51ce4e1f5b4a26be0b67e99a8c4a29 Mon Sep 17 00:00:00 
2001 From: llhuii Date: Fri, 23 Jul 2021 11:21:08 +0800 Subject: [PATCH 4/7] gm: add dataset controller Only handle dataset update from edge. Signed-off-by: llhuii --- .../controllers/dataset/dataset.go | 45 +++++++++++++------ pkg/globalmanager/controllers/registry.go | 34 +++++++++----- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go index 8964f641..9225ffa2 100644 --- a/pkg/globalmanager/controllers/dataset/dataset.go +++ b/pkg/globalmanager/controllers/dataset/dataset.go @@ -3,42 +3,43 @@ package dataset import ( "context" "encoding/json" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/cache" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" - "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +const ( + // KindName is the kind name of CR this controller controls + KindName = "Dataset" + + // Name is this controller name + Name = "Dataset" ) // Controller handles all dataset objects including: syncing to edge and update from edge. type Controller struct { client sednaclientset.SednaV1alpha1Interface - storeSynced cache.InformerSynced - - // A store of dataset - lister sednav1listers.DatasetLister - cfg *config.ControllerConfig } -// updateDatasetFromEdge syncs update from edge -func (c *Controller) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { +// updateFromEdge syncs update from edge +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { status := sednav1.DatasetStatus{} err := json.Unmarshal(content, &status) if err != nil { return err } - return c.updateDatasetStatus(name, namespace, status) + return c.updateStatus(name, namespace, status) } -// updateDatasetStatus updates the dataset status -func (c *Controller) updateDatasetStatus(name, namespace string, status sednav1.DatasetStatus) error { +// updateStatus updates the dataset status +func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error { client := c.client.Datasets(namespace) if status.UpdateTime == nil { @@ -56,3 +57,19 @@ func (c *Controller) updateDatasetStatus(name, namespace string, status sednav1. return err }) } + +func (c *Controller) Run(stopCh <-chan struct{}) { + // noop now +} + +// New creates a dataset controller +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + c := &Controller{ + client: cc.SednaClient.SednaV1alpha1(), + } + + // only upstream + cc.UpstreamController.Add(KindName, c.updateFromEdge) + + return c, nil +} diff --git a/pkg/globalmanager/controllers/registry.go b/pkg/globalmanager/controllers/registry.go index dde760aa..dbd4cd36 100644 --- a/pkg/globalmanager/controllers/registry.go +++ b/pkg/globalmanager/controllers/registry.go @@ -1,8 +1,25 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package controllers import ( "fmt" + "github.com/kubeedge/sedna/pkg/globalmanager/controllers/dataset" fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning" il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference" @@ -14,19 +31,12 @@ type FeatureFactory = func(*runtime.ControllerContext) (runtime.FeatureControlle type Registry map[string]FeatureFactory -func (r Registry) Register(name string, factory FeatureFactory) error { - if _, ok := r[name]; ok { - return fmt.Errorf("a feature controller named %s already exists", name) - } - r[name] = factory - return nil -} - func NewRegistry() Registry { return Registry{ - ji.Name: ji.New, - fl.Name: fl.New, - il.Name: il.New, - ll.Name: ll.New, + ji.Name: ji.New, + fl.Name: fl.New, + il.Name: il.New, + ll.Name: ll.New, + dataset.Name: dataset.New, } } From e52ac06ed1700ef6b215076f09a4007a003d8310 Mon Sep 17 00:00:00 2001 From: llhuii Date: Fri, 23 Jul 2021 12:11:38 +0800 Subject: [PATCH 5/7] gm: split all upstream logic into separate file Signed-off-by: llhuii --- .../controllers/dataset/dataset.go | 59 ++----- .../controllers/dataset/downstream.go | 17 ++ .../controllers/dataset/upstream.go | 62 +++++++ .../federatedlearning/federatedlearningjob.go | 95 +--------- .../controllers/federatedlearning/upstream.go | 123 +++++++++++++ .../incrementallearningjob.go | 77 +------- .../incrementallearning/upstream.go | 162 +++++++++++++++++ .../jointinference/jointinferenceservice.go | 62 +------ .../controllers/jointinference/upstream.go | 92 ++++++++++ .../lifelonglearning/lifelonglearningjob.go | 79 +-------- .../controllers/lifelonglearning/upstream.go | 164 ++++++++++++++++++ pkg/globalmanager/controllers/registry.go | 2 - pkg/globalmanager/controllers/upstream.go | 7 +- pkg/globalmanager/runtime/types.go | 110 ------------ .../manager/incrementallearningjob.go | 3 +- 15 files changed, 650 insertions(+), 464 deletions(-) create mode 100644 pkg/globalmanager/controllers/dataset/downstream.go create mode 100644 pkg/globalmanager/controllers/dataset/upstream.go create mode 100644 pkg/globalmanager/controllers/federatedlearning/upstream.go create mode 100644 pkg/globalmanager/controllers/incrementallearning/upstream.go create mode 100644 pkg/globalmanager/controllers/jointinference/upstream.go create mode 100644 pkg/globalmanager/controllers/lifelonglearning/upstream.go diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go index 9225ffa2..aad4efd9 100644 --- a/pkg/globalmanager/controllers/dataset/dataset.go +++ b/pkg/globalmanager/controllers/dataset/dataset.go @@ -1,15 +1,26 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package dataset import ( - "context" - "encoding/json" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - "github.com/kubeedge/sedna/pkg/globalmanager/runtime" - sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) const ( @@ -27,37 +38,6 @@ type Controller struct { cfg *config.ControllerConfig } -// updateFromEdge syncs update from edge -func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { - status := sednav1.DatasetStatus{} - err := json.Unmarshal(content, &status) - if err != nil { - return err - } - - return c.updateStatus(name, namespace, status) -} - -// updateStatus updates the dataset status -func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error { - client := c.client.Datasets(namespace) - - if status.UpdateTime == nil { - now := metav1.Now() - status.UpdateTime = &now - } - - return runtime.RetryUpdateStatus(name, namespace, func() error { - dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - dataset.Status = status - _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) - return err - }) -} - func (c *Controller) Run(stopCh <-chan struct{}) { // noop now } @@ -68,8 +48,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { client: cc.SednaClient.SednaV1alpha1(), } - // only upstream - cc.UpstreamController.Add(KindName, c.updateFromEdge) + c.addUpstreamHandler(cc) return c, nil } diff --git a/pkg/globalmanager/controllers/dataset/downstream.go b/pkg/globalmanager/controllers/dataset/downstream.go new file mode 100644 index 00000000..d876e30f --- /dev/null +++ b/pkg/globalmanager/controllers/dataset/downstream.go @@ -0,0 +1,17 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package dataset diff --git a/pkg/globalmanager/controllers/dataset/upstream.go b/pkg/globalmanager/controllers/dataset/upstream.go new file mode 100644 index 00000000..26a9feaa --- /dev/null +++ b/pkg/globalmanager/controllers/dataset/upstream.go @@ -0,0 +1,62 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package dataset + +import ( + "context" + "encoding/json" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// updateFromEdge syncs update from edge +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + status := sednav1.DatasetStatus{} + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + return c.updateStatus(name, namespace, status) +} + +// updateStatus updates the dataset status +func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error { + client := c.client.Datasets(namespace) + + if status.UpdateTime == nil { + now := metav1.Now() + status.UpdateTime = &now + } + + return runtime.RetryUpdateStatus(name, namespace, func() error { + dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + dataset.Status = status + _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) + return err + }) +} + +func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { + return cc.UpstreamController.Add(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index fb54a36b..402ed8a4 100644 --- a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -18,7 +18,6 @@ package federatedlearning import ( "context" - "encoding/json" "fmt" "strconv" "time" @@ -527,98 +526,6 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, return } -func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error { - var err error - job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{}) - if err != nil { - // federated crd not found - return err - } - modelName := job.Spec.AggregationWorker.Model.Name - client := c.client.Models(namespace) - - return runtime.RetryUpdateStatus(modelName, namespace, (func() error { - model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{}) - if err != nil { - return err - } - - now := metav1.Now() - model.Status.UpdateTime = &now - model.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) - return err - })) -} - -func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { - client := c.client.FederatedLearningJobs(namespace) - - return runtime.RetryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateFromEdge updates the federated job's status -func (c 
*Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) { - // JobInfo defines the job information - type JobInfo struct { - // Current training round - CurrentRound int `json:"currentRound"` - UpdateTime string `json:"updateTime"` - } - - // Output defines job output information - type Output struct { - Models []runtime.Model `json:"models"` - JobInfo *JobInfo `json:"ownerInfo"` - } - - var status struct { - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err = json.Unmarshal(content, &status) - if err != nil { - return - } - - output := status.Output - - if output != nil { - // Update the model's metrics - if len(output.Models) > 0 { - // only one model - model := output.Models[0] - metrics := runtime.ConvertMapToMetrics(model.Metrics) - if len(metrics) > 0 { - c.updateModelMetrics(name, namespace, metrics) - } - } - - jobInfo := output.JobInfo - // update job info if having any info - if jobInfo != nil && jobInfo.CurrentRound > 0 { - // Find a good place to save the progress info - // TODO: more meaningful reason/message - reason := "DoTraining" - message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) - cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) - c.appendStatusCondition(name, namespace, cond) - } - } - - return nil -} - // New creates a new federated learning job controller that keeps the relevant pods // in sync with their corresponding FederatedLearningJob objects. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { @@ -662,7 +569,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { fc.podStore = podInformer.Lister() fc.podStoreSynced = podInformer.Informer().HasSynced - cc.UpstreamController.Add(KindName, fc.updateFromEdge) + fc.addUpstreamHandler(cc) return fc, nil } diff --git a/pkg/globalmanager/controllers/federatedlearning/upstream.go b/pkg/globalmanager/controllers/federatedlearning/upstream.go new file mode 100644 index 00000000..0bcba81e --- /dev/null +++ b/pkg/globalmanager/controllers/federatedlearning/upstream.go @@ -0,0 +1,123 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package federatedlearning + +import ( + "context" + "encoding/json" + "fmt" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error { + var err error + job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{}) + if err != nil { + // federated crd not found + return err + } + modelName := job.Spec.AggregationWorker.Model.Name + client := c.client.Models(namespace) + + return runtime.RetryUpdateStatus(modelName, namespace, (func() error { + model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{}) + if err != nil { + return err + } + + now := metav1.Now() + model.Status.UpdateTime = &now + model.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) + return err + })) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { + client := c.client.FederatedLearningJobs(namespace) + + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge updates the federated job's status +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) { + // JobInfo defines the job information + type JobInfo struct { + // Current training round + CurrentRound int `json:"currentRound"` + UpdateTime string `json:"updateTime"` + } + + // Output defines job output information + type Output struct { + Models []runtime.Model `json:"models"` + JobInfo *JobInfo `json:"ownerInfo"` + } + + var status struct { + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err = json.Unmarshal(content, &status) + if err != nil { + return + } + + output := status.Output + + if output != nil { + // Update the model's metrics + if len(output.Models) > 0 { + // only one model + model := output.Models[0] + metrics := runtime.ConvertMapToMetrics(model.Metrics) + if len(metrics) > 0 { + c.updateModelMetrics(name, namespace, metrics) + } + } + + jobInfo := output.JobInfo + // update job info if having any info + if jobInfo != nil && jobInfo.CurrentRound > 0 { + // Find a good place to save the progress info + // TODO: more meaningful reason/message + reason := "DoTraining" + message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) + cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) + c.appendStatusCondition(name, namespace, cond) + } + } + + return nil +} + +func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { + return cc.UpstreamController.Add(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index d995eac6..4accd112 100644 --- a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -599,7 +599,7 @@ func (c *Controller) createPod(job 
*sednav1.IncrementalLearningJob, podtype sedn // get all url for train and eval from data in condition condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond runtime.IncrementalCondData + var cond IncrementalCondData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") @@ -812,79 +812,6 @@ func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error { return err } -func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { - client := c.client.IncrementalLearningJobs(namespace) - return runtime.RetryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateFromEdge syncs the edge updates to k8s -func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err := json.Unmarshal(content, &jobStatus) - if err != nil { - return err - } - - // Get the condition data. - // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData runtime.IncrementalCondData - err = json.Unmarshal(content, &condData) - if err != nil { - return err - } - condDataBytes, _ := json.Marshal(&condData) - - cond := sednav1.ILJobCondition{ - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Data: string(condDataBytes), - Message: "reported by lc", - } - - switch strings.ToLower(jobStatus.Phase) { - case "train": - cond.Stage = sednav1.ILJobTrain - case "eval": - cond.Stage = sednav1.ILJobEval - case "deploy": - cond.Stage = sednav1.ILJobDeploy - default: - return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) - } - - switch strings.ToLower(jobStatus.Status) { - case "ready": - cond.Type = sednav1.ILJobStageCondReady - case "completed": - cond.Type = sednav1.ILJobStageCondCompleted - case "failed": - cond.Type = sednav1.ILJobStageCondFailed - case "waiting": - cond.Type = sednav1.ILJobStageCondWaiting - default: - return fmt.Errorf("invalid condition type: %v", jobStatus.Status) - } - - err = c.appendStatusCondition(name, namespace, cond) - if err != nil { - return fmt.Errorf("failed to append condition, err:%+w", err) - } - return nil -} - // New creates a new IncrementalJob controller that keeps the relevant pods // in sync with their corresponding IncrementalJob objects. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { @@ -926,7 +853,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - cc.UpstreamController.Add(KindName, jc.updateFromEdge) + jc.addUpstreamHandler(cc) return jc, nil } diff --git a/pkg/globalmanager/controllers/incrementallearning/upstream.go b/pkg/globalmanager/controllers/incrementallearning/upstream.go new file mode 100644 index 00000000..fa3975a4 --- /dev/null +++ b/pkg/globalmanager/controllers/incrementallearning/upstream.go @@ -0,0 +1,162 @@ +/* +Copyright 2021 The KubeEdge Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package incrementallearning + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type Model = runtime.Model + +// the data of this condition including the input/output to do the next step +type IncrementalCondData struct { + Input *struct { + // Only one model cases + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + + DataURL string `json:"dataURL,omitempty"` + + // the data samples reference will be stored into this URL. + // The content of this url would be: + // # the first uncomment line means the directory + // s3://dataset/ + // mnist/0.jpg + // mnist/1.jpg + DataIndexURL string `json:"dataIndexURL,omitempty"` + + OutputDir string `json:"outputDir,omitempty"` + } `json:"input,omitempty"` + + Output *struct { + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + } `json:"output,omitempty"` +} + +func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string { + var modelURLs []string + if model != nil { + modelURLs = append(modelURLs, model.GetURL()) + } else { + for _, m := range models { + modelURLs = append(modelURLs, m.GetURL()) + } + } + return modelURLs +} + +func (cd *IncrementalCondData) GetInputModelURLs() []string { + return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) +} + +func (cd *IncrementalCondData) GetOutputModelURLs() []string { + return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) +} + +func (cd *IncrementalCondData) Unmarshal(data []byte) error { + return json.Unmarshal(data, cd) +} + +func (cd IncrementalCondData) Marshal() ([]byte, error) { + return json.Marshal(cd) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { + client := c.client.IncrementalLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition data. 
+ // Here unmarshal and marshal immediately to skip the unnecessary fields + var condData IncrementalCondData + err = json.Unmarshal(content, &condData) + if err != nil { + return err + } + condDataBytes, _ := json.Marshal(&condData) + + cond := sednav1.ILJobCondition{ + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Data: string(condDataBytes), + Message: "reported by lc", + } + + switch strings.ToLower(jobStatus.Phase) { + case "train": + cond.Stage = sednav1.ILJobTrain + case "eval": + cond.Stage = sednav1.ILJobEval + case "deploy": + cond.Stage = sednav1.ILJobDeploy + default: + return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) + } + + switch strings.ToLower(jobStatus.Status) { + case "ready": + cond.Type = sednav1.ILJobStageCondReady + case "completed": + cond.Type = sednav1.ILJobStageCondCompleted + case "failed": + cond.Type = sednav1.ILJobStageCondFailed + case "waiting": + cond.Type = sednav1.ILJobStageCondWaiting + default: + return fmt.Errorf("invalid condition type: %v", jobStatus.Status) + } + + err = c.appendStatusCondition(name, namespace, cond) + if err != nil { + return fmt.Errorf("failed to append condition, err:%+w", err) + } + return nil +} + +func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { + return cc.UpstreamController.Add(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go index 50cbf408..ea1c8574 100644 --- a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go +++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go @@ -533,66 +533,6 @@ func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bi return err } -func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error { - client := c.client.JointInferenceServices(namespace) - - return runtime.RetryUpdateStatus(name, namespace, func() error { - joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - joint.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) - return err - }) -} - -// updateFromEdge syncs the edge updates to k8s -func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { - // Output defines owner output information - type Output struct { - ServiceInfo map[string]interface{} `json:"ownerInfo"` - } - - var status struct { - // Phase always should be "inference" - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err := json.Unmarshal(content, &status) - if err != nil { - return err - } - - // TODO: propagate status.Status to k8s - - output := status.Output - if output == nil || output.ServiceInfo == nil { - // no output info - klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) - return nil - } - - info := output.ServiceInfo - - for _, ignoreTimeKey := range []string{ - "startTime", - "updateTime", - } { - delete(info, ignoreTimeKey) - } - - metrics := runtime.ConvertMapToMetrics(info) - - err = c.updateMetrics(name, namespace, metrics) - if err != nil { - return fmt.Errorf("failed to update metrics, err:%+w", err) - } - return nil -} - // New creates a new JointInferenceService controller that keeps the relevant pods // in sync with their corresponding 
JointInferenceService objects. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { @@ -640,7 +580,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - cc.UpstreamController.Add(KindName, jc.updateFromEdge) + jc.addUpstreamHandler(cc) return jc, nil } diff --git a/pkg/globalmanager/controllers/jointinference/upstream.go b/pkg/globalmanager/controllers/jointinference/upstream.go new file mode 100644 index 00000000..ceff6e77 --- /dev/null +++ b/pkg/globalmanager/controllers/jointinference/upstream.go @@ -0,0 +1,92 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package jointinference + +import ( + "context" + "encoding/json" + "fmt" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" +) + +func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error { + client := c.client.JointInferenceServices(namespace) + + return runtime.RetryUpdateStatus(name, namespace, func() error { + joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + joint.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + // Output defines owner output information + type Output struct { + ServiceInfo map[string]interface{} `json:"ownerInfo"` + } + + var status struct { + // Phase always should be "inference" + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + // TODO: propagate status.Status to k8s + + output := status.Output + if output == nil || output.ServiceInfo == nil { + // no output info + klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) + return nil + } + + info := output.ServiceInfo + + for _, ignoreTimeKey := range []string{ + "startTime", + "updateTime", + } { + delete(info, ignoreTimeKey) + } + + metrics := runtime.ConvertMapToMetrics(info) + + err = c.updateMetrics(name, namespace, metrics) + if err != nil { + return fmt.Errorf("failed to update metrics, err:%+w", err) + } + return nil +} + +func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { + return cc.UpstreamController.Add(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go index 1aa80cdd..baff37e1 100644 --- a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go +++ 
b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go @@ -18,7 +18,6 @@ package lifelonglearning import ( "context" - "encoding/json" "fmt" "strings" "time" @@ -527,7 +526,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1 // get all url for train and eval from data in condition condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond runtime.LifelongLearningCondData + var cond LifelongLearningCondData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") @@ -704,80 +703,6 @@ func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { return err } -func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { - client := c.client.LifelongLearningJobs(namespace) - return runtime.RetryUpdateStatus(name, namespace, func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - }) -} - -// updateFromEdge syncs the edge updates to k8s -func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err := json.Unmarshal(content, &jobStatus) - if err != nil { - return err - } - - // Get the condition data. - // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData runtime.LifelongLearningCondData - err = json.Unmarshal(content, &condData) - if err != nil { - return err - } - - condDataBytes, _ := json.Marshal(&condData) - - cond := sednav1.LLJobCondition{ - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Data: string(condDataBytes), - Message: "reported by lc", - } - - switch strings.ToLower(jobStatus.Phase) { - case "train": - cond.Stage = sednav1.LLJobTrain - case "eval": - cond.Stage = sednav1.LLJobEval - case "deploy": - cond.Stage = sednav1.LLJobDeploy - default: - return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) - } - - switch strings.ToLower(jobStatus.Status) { - case "ready": - cond.Type = sednav1.LLJobStageCondReady - case "completed": - cond.Type = sednav1.LLJobStageCondCompleted - case "failed": - cond.Type = sednav1.LLJobStageCondFailed - case "waiting": - cond.Type = sednav1.LLJobStageCondWaiting - default: - return fmt.Errorf("invalid condition type: %v", jobStatus.Status) - } - - err = c.appendStatusCondition(name, namespace, cond) - if err != nil { - return fmt.Errorf("failed to append condition, err:%+w", err) - } - return nil -} - // New creates a new LifelongLearningJob controller that keeps the relevant pods // in sync with their corresponding LifelongLearningJob objects. 
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { @@ -820,7 +745,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - cc.UpstreamController.Add(KindName, jc.updateFromEdge) + jc.addUpstreamHandler(cc) return jc, nil } diff --git a/pkg/globalmanager/controllers/lifelonglearning/upstream.go b/pkg/globalmanager/controllers/lifelonglearning/upstream.go new file mode 100644 index 00000000..1c5e768f --- /dev/null +++ b/pkg/globalmanager/controllers/lifelonglearning/upstream.go @@ -0,0 +1,164 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package lifelonglearning + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +type Model = runtime.Model + +// the data of this condition including the input/output to do the next step +type LifelongLearningCondData struct { + Input *struct { + // Only one model cases + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + + DataURL string `json:"dataURL,omitempty"` + + // the data samples reference will be stored into this URL. 
+ // The content of this url would be: + // # the first uncomment line means the directory + // s3://dataset/ + // mnist/0.jpg + // mnist/1.jpg + DataIndexURL string `json:"dataIndexURL,omitempty"` + + OutputDir string `json:"outputDir,omitempty"` + } `json:"input,omitempty"` + + Output *struct { + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + } `json:"output,omitempty"` +} + +func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string { + var modelURLs []string + if model != nil { + modelURLs = append(modelURLs, model.GetURL()) + } else { + for _, m := range models { + modelURLs = append(modelURLs, m.GetURL()) + } + } + return modelURLs +} + +func (cd *LifelongLearningCondData) Unmarshal(data []byte) error { + return json.Unmarshal(data, cd) +} + +func (cd LifelongLearningCondData) Marshal() ([]byte, error) { + return json.Marshal(cd) +} + +func (cd *LifelongLearningCondData) GetInputModelURLs() []string { + return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) +} + +func (cd *LifelongLearningCondData) GetOutputModelURLs() []string { + return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { + client := c.client.LifelongLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, namespace, func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition data. 
+	// Here unmarshal and marshal immediately to skip the unnecessary fields
+	var condData LifelongLearningCondData
+	err = json.Unmarshal(content, &condData)
+	if err != nil {
+		return err
+	}
+
+	condDataBytes, _ := json.Marshal(&condData)
+
+	cond := sednav1.LLJobCondition{
+		Status:             v1.ConditionTrue,
+		LastHeartbeatTime:  metav1.Now(),
+		LastTransitionTime: metav1.Now(),
+		Data:               string(condDataBytes),
+		Message:            "reported by lc",
+	}
+
+	switch strings.ToLower(jobStatus.Phase) {
+	case "train":
+		cond.Stage = sednav1.LLJobTrain
+	case "eval":
+		cond.Stage = sednav1.LLJobEval
+	case "deploy":
+		cond.Stage = sednav1.LLJobDeploy
+	default:
+		return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
+	}
+
+	switch strings.ToLower(jobStatus.Status) {
+	case "ready":
+		cond.Type = sednav1.LLJobStageCondReady
+	case "completed":
+		cond.Type = sednav1.LLJobStageCondCompleted
+	case "failed":
+		cond.Type = sednav1.LLJobStageCondFailed
+	case "waiting":
+		cond.Type = sednav1.LLJobStageCondWaiting
+	default:
+		return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
+	}
+
+	err = c.appendStatusCondition(name, namespace, cond)
+	if err != nil {
+		return fmt.Errorf("failed to append condition, err:%+w", err)
+	}
+	return nil
+}
+
+func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error {
+	return cc.UpstreamController.Add(KindName, c.updateFromEdge)
+}
diff --git a/pkg/globalmanager/controllers/registry.go b/pkg/globalmanager/controllers/registry.go
index dbd4cd36..1af7db1f 100644
--- a/pkg/globalmanager/controllers/registry.go
+++ b/pkg/globalmanager/controllers/registry.go
@@ -17,8 +17,6 @@ limitations under the License.
 package controllers
 
 import (
-	"fmt"
-
 	"github.com/kubeedge/sedna/pkg/globalmanager/controllers/dataset"
 	fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning"
 	il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
diff --git a/pkg/globalmanager/controllers/upstream.go b/pkg/globalmanager/controllers/upstream.go
index 785f7c21..9e6a2216 100644
--- a/pkg/globalmanager/controllers/upstream.go
+++ b/pkg/globalmanager/controllers/upstream.go
@@ -32,7 +32,7 @@ type UpstreamController struct {
 	updateHandlers map[string]runtime.UpstreamUpdateHandler
 }
 
-func (uc *UpstreamController)checkOperation(operation string) error {
+func (uc *UpstreamController) checkOperation(operation string) error {
 	// current only support the 'status' operation
 	if operation != "status" {
 		return fmt.Errorf("unknown operation %s", operation)
@@ -52,11 +52,10 @@ func (uc *UpstreamController) syncEdgeUpdate() {
 		update, err := uc.messageLayer.ReceiveResourceUpdate()
 		if err == nil {
-			err = uc.checkOperation(update.operation)
+			err = uc.checkOperation(update.Operation)
 		}
 		if err != nil {
 			klog.Warningf("Ignore update since this err: %+v", err)
 			continue
 		}
-
 		kind := update.Kind
 		namespace := update.Namespace
diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go
index ecd4144e..5bbe13da 100644
--- a/pkg/globalmanager/runtime/types.go
+++ b/pkg/globalmanager/runtime/types.go
@@ -17,8 +17,6 @@ limitations under the License.
package runtime import ( - "encoding/json" - "github.com/kubeedge/sedna/pkg/globalmanager/config" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sruntime "k8s.io/apimachinery/pkg/runtime" @@ -48,32 +46,6 @@ type Model struct { Metrics map[string]interface{} `json:"metrics,omitempty"` } -// the data of this condition including the input/output to do the next step -type IncrementalCondData struct { - Input *struct { - // Only one model cases - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - - DataURL string `json:"dataURL,omitempty"` - - // the data samples reference will be stored into this URL. - // The content of this url would be: - // # the first uncomment line means the directory - // s3://dataset/ - // mnist/0.jpg - // mnist/1.jpg - DataIndexURL string `json:"dataIndexURL,omitempty"` - - OutputDir string `json:"outputDir,omitempty"` - } `json:"input,omitempty"` - - Output *struct { - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - } `json:"output,omitempty"` -} - const ( // TrainPodType is type of train pod TrainPodType = "train" @@ -90,88 +62,6 @@ func (m *Model) GetURL() string { return m.URL } -func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string { - var modelURLs []string - if model != nil { - modelURLs = append(modelURLs, model.GetURL()) - } else { - for _, m := range models { - modelURLs = append(modelURLs, m.GetURL()) - } - } - return modelURLs -} - -func (cd *IncrementalCondData) GetInputModelURLs() []string { - return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) -} - -func (cd *IncrementalCondData) GetOutputModelURLs() []string { - return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) -} - -func (cd *IncrementalCondData) Unmarshal(data []byte) error { - return json.Unmarshal(data, cd) -} - -func (cd IncrementalCondData) Marshal() ([]byte, error) { - return json.Marshal(cd) -} - -// the data of this condition including the input/output to do the next step -type LifelongLearningCondData struct { - Input *struct { - // Only one model cases - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - - DataURL string `json:"dataURL,omitempty"` - - // the data samples reference will be stored into this URL. 
- // The content of this url would be: - // # the first uncomment line means the directory - // s3://dataset/ - // mnist/0.jpg - // mnist/1.jpg - DataIndexURL string `json:"dataIndexURL,omitempty"` - - OutputDir string `json:"outputDir,omitempty"` - } `json:"input,omitempty"` - - Output *struct { - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - } `json:"output,omitempty"` -} - -func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string { - var modelURLs []string - if model != nil { - modelURLs = append(modelURLs, model.GetURL()) - } else { - for _, m := range models { - modelURLs = append(modelURLs, m.GetURL()) - } - } - return modelURLs -} - -func (cd *LifelongLearningCondData) Unmarshal(data []byte) error { - return json.Unmarshal(data, cd) -} - -func (cd LifelongLearningCondData) Marshal() ([]byte, error) { - return json.Marshal(cd) -} - -func (cd *LifelongLearningCondData) GetInputModelURLs() []string { - return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) -} - -func (cd *LifelongLearningCondData) GetOutputModelURLs() []string { - return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) -} - // updateHandler handles the updates from LC(running at edge) to update the // corresponding resource type UpstreamUpdateHandler func(namespace, name, operation string, content []byte) error diff --git a/pkg/localcontroller/manager/incrementallearningjob.go b/pkg/localcontroller/manager/incrementallearningjob.go index e565e4d0..abe30aec 100644 --- a/pkg/localcontroller/manager/incrementallearningjob.go +++ b/pkg/localcontroller/manager/incrementallearningjob.go @@ -31,6 +31,7 @@ import ( "github.com/kubeedge/sedna/cmd/sedna-lc/app/options" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + gmtypes "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/localcontroller/db" "github.com/kubeedge/sedna/pkg/localcontroller/gmclient" @@ -441,7 +442,7 @@ func (im *IncrementalJobManager) getTrainOrEvalModel(job *IncrementalLearningJob var models []runtime.Model for i := len(jobConditions) - 1; i >= 0; i-- { - var cond runtime.IncrementalCondData + var cond gmtypes.IncrementalCondData jobCond := jobConditions[i] if jobCond.Stage == sednav1.ILJobTrain && jobCond.Type == sednav1.ILJobStageCondCompleted { if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil { From 5c1c1674da5a4c97e69cc2c854864ed9da696540 Mon Sep 17 00:00:00 2001 From: llhuii Date: Mon, 26 Jul 2021 15:56:39 +0800 Subject: [PATCH 6/7] gm: split all downstream logic into separate file Since all CR watch actions are placed into corresponding controller, controllers/downstream.go is unnecessary. 
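
To make the new shape concrete, below is a minimal sketch (an illustration
for this commit message only, not part of the diff) of the downstream
pattern every feature controller now follows: watch its own CR through the
shared informer factory and forward events to the edge through an injected
send function. The toyfeature package and the annotation-based node lookup
are hypothetical, and the exact runtime.DownstreamSendFunc signature is
assumed from its call sites; real controllers such as dataset read the node
name from their CR spec.

    package toyfeature

    import (
    	"fmt"

    	"k8s.io/apimachinery/pkg/watch"
    	"k8s.io/client-go/tools/cache"

    	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
    )

    // Controller owns both the CR watch and the edge sync for one feature,
    // so no shared DownstreamController is needed any more.
    type Controller struct {
    	sendToEdgeFunc runtime.DownstreamSendFunc
    }

    // syncToEdge propagates one CR event to the node running its worker.
    func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
    	cr, ok := obj.(runtime.CommonInterface)
    	if !ok {
    		return nil
    	}
    	// Hypothetical node selection; a real controller reads it from the
    	// CR spec, e.g. dataset.Spec.NodeName.
    	nodeName := cr.GetAnnotations()["example.sedna.io/node"]
    	if nodeName == "" {
    		return fmt.Errorf("empty node name")
    	}
    	return c.sendToEdgeFunc(nodeName, eventType, cr)
    }

    // SetDownstreamSendFunc lets the manager inject the message transport.
    func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
    	c.sendToEdgeFunc = f
    	return nil
    }

    // watchCR wires informer events to syncToEdge; updates are resent as
    // watch.Added, mirroring the dataset controller below.
    func (c *Controller) watchCR(informer cache.SharedIndexInformer) {
    	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
    		AddFunc:    func(obj interface{}) { c.syncToEdge(watch.Added, obj) },
    		UpdateFunc: func(old, cur interface{}) { c.syncToEdge(watch.Added, cur) },
    		DeleteFunc: func(obj interface{}) { c.syncToEdge(watch.Deleted, obj) },
    	})
    }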
Signed-off-by: llhuii --- .../controllers/dataset/dataset.go | 28 +- .../controllers/dataset/downstream.go | 40 ++ pkg/globalmanager/controllers/downstream.go | 369 ------------------ .../federatedlearning/downstream.go | 51 +++ .../federatedlearning/federatedlearningjob.go | 6 + .../incrementallearning/downstream.go | 142 +++++++ .../incrementallearningjob.go | 6 + .../controllers/jointinference/downstream.go | 51 +++ .../jointinference/jointinferenceservice.go | 6 + .../lifelonglearning/downstream.go | 49 +++ .../lifelonglearning/lifelonglearningjob.go | 6 + pkg/globalmanager/controllers/manager.go | 7 +- pkg/globalmanager/runtime/secret_injector.go | 16 +- pkg/globalmanager/runtime/types.go | 13 +- 14 files changed, 412 insertions(+), 378 deletions(-) delete mode 100644 pkg/globalmanager/controllers/downstream.go create mode 100644 pkg/globalmanager/controllers/federatedlearning/downstream.go create mode 100644 pkg/globalmanager/controllers/incrementallearning/downstream.go create mode 100644 pkg/globalmanager/controllers/jointinference/downstream.go create mode 100644 pkg/globalmanager/controllers/lifelonglearning/downstream.go diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go index aad4efd9..1de420c9 100644 --- a/pkg/globalmanager/controllers/dataset/dataset.go +++ b/pkg/globalmanager/controllers/dataset/dataset.go @@ -17,9 +17,12 @@ limitations under the License. package dataset import ( + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) @@ -33,9 +36,12 @@ const ( // Controller handles all dataset objects including: syncing to edge and update from edge. type Controller struct { - client sednaclientset.SednaV1alpha1Interface + kubeClient kubernetes.Interface + client sednaclientset.SednaV1alpha1Interface cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } func (c *Controller) Run(stopCh <-chan struct{}) { @@ -45,8 +51,24 @@ func (c *Controller) Run(stopCh <-chan struct{}) { // New creates a dataset controller func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { c := &Controller{ - client: cc.SednaClient.SednaV1alpha1(), + client: cc.SednaClient.SednaV1alpha1(), + kubeClient: cc.KubeClient, } + informer := cc.SednaInformerFactory.Sedna().V1alpha1().Datasets().Informer() + informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + + AddFunc: func(obj interface{}) { + c.syncToEdge(watch.Added, obj) + }, + + UpdateFunc: func(old, cur interface{}) { + c.syncToEdge(watch.Added, cur) + }, + + DeleteFunc: func(obj interface{}) { + c.syncToEdge(watch.Deleted, obj) + }, + }) c.addUpstreamHandler(cc) diff --git a/pkg/globalmanager/controllers/dataset/downstream.go b/pkg/globalmanager/controllers/dataset/downstream.go index d876e30f..8f9553fb 100644 --- a/pkg/globalmanager/controllers/dataset/downstream.go +++ b/pkg/globalmanager/controllers/dataset/downstream.go @@ -15,3 +15,43 @@ limitations under the License. 
*/ package dataset + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// syncToEdge syncs the dataset resources +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + dataset, ok := obj.(*sednav1.Dataset) + if !ok { + return nil + } + + // Here only propagate to the nodes with non empty name + nodeName := dataset.Spec.NodeName + if len(nodeName) == 0 { + return fmt.Errorf("empty node name") + } + + // Since t.Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + if len(dataset.Kind) == 0 { + dataset.Kind = KindName + } + + runtime.InjectSecretAnnotations(c.kubeClient, dataset, dataset.Spec.CredentialName) + + return c.sendToEdgeFunc(nodeName, eventType, dataset) +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + + return nil +} diff --git a/pkg/globalmanager/controllers/downstream.go b/pkg/globalmanager/controllers/downstream.go deleted file mode 100644 index 34938817..00000000 --- a/pkg/globalmanager/controllers/downstream.go +++ /dev/null @@ -1,369 +0,0 @@ -/* -Copyright 2021 The KubeEdge Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controllers - -import ( - "context" - "fmt" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - k8sruntime "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - - sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - "github.com/kubeedge/sedna/pkg/globalmanager/config" - "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" - "github.com/kubeedge/sedna/pkg/globalmanager/runtime" -) - -// DownstreamController watch kubernetes api server and send the controller resource change to edge -type DownstreamController struct { - // events from watch kubernetes api server - events chan watch.Event - - cfg *config.ControllerConfig - - client clientset.SednaV1alpha1Interface - kubeClient kubernetes.Interface - - messageLayer messagelayer.MessageLayer -} - -func (dc *DownstreamController) injectSecret(obj runtime.CommonInterface, secretName string) error { - if secretName == "" { - return nil - } - - secret, err := dc.kubeClient.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{}) - if err != nil { - klog.Warningf("failed to get the secret %s: %+v", - secretName, err) - - return err - } - runtime.InjectSecretObj(obj, secret) - return err -} - -// syncDataset syncs the dataset resources -func (dc *DownstreamController) syncDataset(eventType watch.EventType, dataset *sednav1.Dataset) error { - // Here only propagate to the nodes with non empty name - nodeName := dataset.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - dc.injectSecret(dataset, dataset.Spec.CredentialName) - - return dc.messageLayer.SendResourceObject(nodeName, eventType, dataset) -} - -// syncJointInferenceService syncs the joint-inference-service resources -func (dc *DownstreamController) syncJointInferenceService(eventType watch.EventType, joint *sednav1.JointInferenceService) error { - // Here only propagate to the nodes with non empty name - // FIXME: only the case that Spec.NodeName specified is support - nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - - return dc.messageLayer.SendResourceObject(nodeName, eventType, joint) -} - -// syncFederatedLearningJob syncs the federated resources -func (dc *DownstreamController) syncFederatedLearningJob(eventType watch.EventType, job *sednav1.FederatedLearningJob) error { - // broadcast to all nodes specified in spec - nodeset := make(map[string]bool) - for _, trainingWorker := range job.Spec.TrainingWorkers { - // Here only propagate to the nodes with non empty name - if len(trainingWorker.Template.Spec.NodeName) > 0 { - nodeset[trainingWorker.Template.Spec.NodeName] = true - } - } - - for nodeName := range nodeset { - dc.messageLayer.SendResourceObject(nodeName, eventType, job) - } - return nil -} - -// syncModelWithName will sync the model to the specified node. -// Now called when creating the incrementaljob. 
-func (dc *DownstreamController) syncModelWithName(nodeName, modelName, namespace string) error { - model, err := dc.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{}) - if err != nil { - // TODO: maybe use err.ErrStatus.Code == 404 - return fmt.Errorf("model(%s/%s) not found", namespace, modelName) - } - - // Since model.Kind may be empty, - // we need to fix the kind here if missing. - // more details at https://github.com/kubernetes/kubernetes/issues/3030 - if len(model.Kind) == 0 { - model.Kind = "Model" - } - - dc.injectSecret(model, model.Spec.CredentialName) - - dc.messageLayer.SendResourceObject(nodeName, watch.Added, model) - return nil -} - -// syncIncrementalJob syncs the incremental learning jobs -func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, job *sednav1.IncrementalLearningJob) error { - jobConditions := job.Status.Conditions - if len(jobConditions) == 0 { - return nil - } - - dataName := job.Spec.Dataset.Name - ds, err := dc.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName) - } - // LC has dataset object on this node that may call dataset node - dsNodeName := ds.Spec.NodeName - - var trainNodeName string - var evalNodeName string - - ann := job.GetAnnotations() - if ann != nil { - trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] - evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)] - } - - if eventType == watch.Deleted { - // delete jobs from all LCs - for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} { - if v != "" { - dc.messageLayer.SendResourceObject(v, eventType, job) - } - } - return nil - } - - latestCondition := jobConditions[len(jobConditions)-1] - currentType := latestCondition.Type - jobStage := latestCondition.Stage - - syncModelWithName := func(modelName string) { - if err := dc.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil { - klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", - modelName, job.Name, dsNodeName, err) - } - } - - syncJobWithNodeName := func(nodeName string) { - if err := dc.messageLayer.SendResourceObject(nodeName, eventType, job); err != nil { - klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v", - job.Name, nodeName, jobStage, err) - } - } - - dc.injectSecret(job, job.Spec.CredentialName) - - doJobStageEvent := func(modelName string, nodeName string) { - if currentType == sednav1.ILJobStageCondWaiting { - syncJobWithNodeName(dsNodeName) - syncModelWithName(modelName) - } else if currentType == sednav1.ILJobStageCondRunning { - if nodeName != "" { - syncJobWithNodeName(nodeName) - } - } else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed { - if nodeName != dsNodeName { - // delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed. 
- dc.messageLayer.SendResourceObject(nodeName, watch.Deleted, job) - } - } - } - - switch jobStage { - case sednav1.ILJobTrain: - doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) - case sednav1.ILJobEval: - doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) - } - - return nil -} - -// syncLifelongLearningJob syncs the lifelonglearning jobs -func (dc *DownstreamController) syncLifelongLearningJob(eventType watch.EventType, job *sednav1.LifelongLearningJob) error { - // Here only propagate to the nodes with non empty name - - // FIXME(llhuii): only the case that all workers having the same nodeName are support, - // will support Spec.NodeSelector and differenect nodeName. - nodeName := job.Spec.TrainSpec.Template.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - - dc.injectSecret(job, job.Spec.CredentialName) - dc.messageLayer.SendResourceObject(nodeName, eventType, job) - - return nil -} - -// sync defines the entrypoint of syncing all resources -func (dc *DownstreamController) sync(stopCh <-chan struct{}) { - for { - select { - case <-stopCh: - klog.Info("Stop controller downstream loop") - return - - case e := <-dc.events: - - var err error - var kind, namespace, name string - switch t := e.Object.(type) { - case (*sednav1.Dataset): - // Since t.Kind may be empty, - // we need to fix the kind here if missing. - // more details at https://github.com/kubernetes/kubernetes/issues/3030 - if len(t.Kind) == 0 { - t.Kind = "Dataset" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncDataset(e.Type, t) - - case (*sednav1.JointInferenceService): - // TODO: find a good way to avoid these duplicate codes - if len(t.Kind) == 0 { - t.Kind = "JointInferenceService" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncJointInferenceService(e.Type, t) - - case (*sednav1.FederatedLearningJob): - if len(t.Kind) == 0 { - t.Kind = "FederatedLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncFederatedLearningJob(e.Type, t) - - case (*sednav1.IncrementalLearningJob): - if len(t.Kind) == 0 { - t.Kind = "IncrementalLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncIncrementalJob(e.Type, t) - case (*sednav1.LifelongLearningJob): - if len(t.Kind) == 0 { - t.Kind = "LifelongLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncLifelongLearningJob(e.Type, t) - default: - klog.Warningf("object type: %T unsupported", e) - continue - } - - if err != nil { - klog.Warningf("Error to sync %s(%s/%s), err: %+v", kind, namespace, name, err) - } else { - klog.V(2).Infof("synced %s(%s/%s)", kind, namespace, name) - } - } - } -} - -// watch function watches the crd resources which should by synced to nodes -func (dc *DownstreamController) watch(stopCh <-chan struct{}) { - rh := cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - eventObj := obj.(k8sruntime.Object) - dc.events <- watch.Event{Type: watch.Added, Object: eventObj} - }, - UpdateFunc: func(old, cur interface{}) { - // Since we don't support the spec update operation currently, - // so only status updates arrive here and NO propagation to edge. - - // Update: - // We sync it to edge when using self-built websocket, and - // this sync isn't needed when we switch out self-built websocket. 
- dc.events <- watch.Event{Type: watch.Added, Object: cur.(k8sruntime.Object)} - }, - DeleteFunc: func(obj interface{}) { - eventObj := obj.(k8sruntime.Object) - dc.events <- watch.Event{Type: watch.Deleted, Object: eventObj} - }, - } - - client := dc.client.RESTClient() - - // make this option configurable - resyncPeriod := time.Second * 60 - namespace := dc.cfg.Namespace - - // TODO: use the informer - for resourceName, object := range map[string]k8sruntime.Object{ - "datasets": &sednav1.Dataset{}, - "jointinferenceservices": &sednav1.JointInferenceService{}, - "federatedlearningjobs": &sednav1.FederatedLearningJob{}, - "incrementallearningjobs": &sednav1.IncrementalLearningJob{}, - "lifelonglearningjobs": &sednav1.LifelongLearningJob{}, - } { - lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything()) - si := cache.NewSharedInformer(lw, object, resyncPeriod) - si.AddEventHandler(rh) - go si.Run(stopCh) - } -} - -// Start starts the controller -func (dc *DownstreamController) Run(stopCh <-chan struct{}) { - // watch is an asynchronous call - dc.watch(stopCh) - - // sync is a synchronous call - dc.sync(stopCh) -} - -// NewDownstreamController creates a controller DownstreamController from config -func NewDownstreamController(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { - // TODO: make bufferSize configurable - bufferSize := 10 - events := make(chan watch.Event, bufferSize) - - dc := &DownstreamController{ - cfg: cc.Config, - events: events, - client: cc.SednaClient.SednaV1alpha1(), - kubeClient: cc.KubeClient, - messageLayer: messagelayer.NewContextMessageLayer(), - } - - return dc, nil -} diff --git a/pkg/globalmanager/controllers/federatedlearning/downstream.go b/pkg/globalmanager/controllers/federatedlearning/downstream.go new file mode 100644 index 00000000..9a50a8ec --- /dev/null +++ b/pkg/globalmanager/controllers/federatedlearning/downstream.go @@ -0,0 +1,51 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package federatedlearning + +import ( + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + job, ok := obj.(*sednav1.FederatedLearningJob) + if !ok { + return nil + } + + // broadcast to all nodes specified in spec + nodeset := make(map[string]bool) + for _, trainingWorker := range job.Spec.TrainingWorkers { + // Here only propagate to the nodes with non empty name + if len(trainingWorker.Template.Spec.NodeName) > 0 { + nodeset[trainingWorker.Template.Spec.NodeName] = true + } + } + + for nodeName := range nodeset { + c.sendToEdgeFunc(nodeName, eventType, job) + } + return nil +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + + return nil +} diff --git a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index 402ed8a4..d3730a0f 100644 --- a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -28,6 +28,7 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -85,6 +86,8 @@ type Controller struct { recorder record.EventRecorder cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } // Run starts the main goroutine responsible for watching and syncing jobs. @@ -550,12 +553,15 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { fc.enqueueController(obj, true) + fc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { fc.enqueueController(cur, true) + fc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { fc.enqueueController(obj, true) + fc.syncToEdge(watch.Deleted, obj) }, }) fc.jobLister = jobInformer.Lister() diff --git a/pkg/globalmanager/controllers/incrementallearning/downstream.go b/pkg/globalmanager/controllers/incrementallearning/downstream.go new file mode 100644 index 00000000..cba0b136 --- /dev/null +++ b/pkg/globalmanager/controllers/incrementallearning/downstream.go @@ -0,0 +1,142 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package incrementallearning + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/klog/v2" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// syncModelWithName will sync the model to the specified node. 
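+// It also fixes the model's empty Kind and injects the model's credential secret into annotations before sending.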
+// Now called when creating the incrementaljob.
+func (c *Controller) syncModelWithName(nodeName, modelName, namespace string) error {
+	model, err := c.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{})
+	if err != nil {
+		// TODO: maybe use err.ErrStatus.Code == 404
+		return fmt.Errorf("model(%s/%s) not found", namespace, modelName)
+	}
+
+	// Since model.Kind may be empty,
+	// we need to fix the kind here if missing.
+	// more details at https://github.com/kubernetes/kubernetes/issues/3030
+	if len(model.Kind) == 0 {
+		model.Kind = "Model"
+	}
+
+	runtime.InjectSecretAnnotations(c.kubeClient, model, model.Spec.CredentialName)
+
+	c.sendToEdgeFunc(nodeName, watch.Added, model)
+	return nil
+}
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	job, ok := obj.(*sednav1.IncrementalLearningJob)
+	if !ok {
+		return nil
+	}
+	job.Kind = KindName
+
+	jobConditions := job.Status.Conditions
+	if len(jobConditions) == 0 {
+		return nil
+	}
+
+	dataName := job.Spec.Dataset.Name
+	ds, err := c.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName)
+	}
+	// the LC on this node keeps the dataset object, so we call it the dataset node
+	dsNodeName := ds.Spec.NodeName
+
+	var trainNodeName string
+	var evalNodeName string
+
+	ann := job.GetAnnotations()
+	if ann != nil {
+		trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)]
+		evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)]
+	}
+
+	if eventType == watch.Deleted {
+		// delete jobs from all LCs
+		for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} {
+			if v != "" {
+				c.sendToEdgeFunc(v, eventType, job)
+			}
+		}
+		return nil
+	}
+
+	latestCondition := jobConditions[len(jobConditions)-1]
+	currentType := latestCondition.Type
+	jobStage := latestCondition.Stage
+
+	syncModelWithName := func(modelName string) {
+		if err := c.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil {
+			klog.Warningf("failed to sync model %s when syncing incremental learning job %s to node %s: %v",
+				modelName, job.Name, dsNodeName, err)
+		}
+	}
+
+	syncJobWithNodeName := func(nodeName string) {
+		if err := c.sendToEdgeFunc(nodeName, eventType, job); err != nil {
+			klog.Warningf("failed to sync incremental learning job %s to node %s in stage %s: %v",
+				job.Name, nodeName, jobStage, err)
+		}
+	}
+
+	runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)
+
+	doJobStageEvent := func(modelName string, nodeName string) {
+		if currentType == sednav1.ILJobStageCondWaiting {
+			syncJobWithNodeName(dsNodeName)
+			syncModelWithName(modelName)
+		} else if currentType == sednav1.ILJobStageCondRunning {
+			if nodeName != "" {
+				syncJobWithNodeName(nodeName)
+			}
+		} else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed {
+			if nodeName != dsNodeName {
+				// delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed.
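+				// (the job is kept on the dataset node itself)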
+ c.sendToEdgeFunc(nodeName, watch.Deleted, job) + } + } + } + + switch jobStage { + case sednav1.ILJobTrain: + doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) + case sednav1.ILJobEval: + doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) + } + + return nil + +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + return nil +} diff --git a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index 4accd112..5e88232f 100644 --- a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -30,6 +30,7 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -83,6 +84,8 @@ type Controller struct { recorder record.EventRecorder cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } // Run starts the main goroutine responsible for watching and syncing jobs. @@ -834,12 +837,15 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { jc.enqueueController(cur, true) + jc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Deleted, obj) }, }) jc.jobLister = jobInformer.Lister() diff --git a/pkg/globalmanager/controllers/jointinference/downstream.go b/pkg/globalmanager/controllers/jointinference/downstream.go new file mode 100644 index 00000000..31778ef6 --- /dev/null +++ b/pkg/globalmanager/controllers/jointinference/downstream.go @@ -0,0 +1,51 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package jointinference
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	joint, ok := obj.(*sednav1.JointInferenceService)
+	if !ok {
+		return nil
+	}
+
+	// Here only propagate to the nodes with non empty name
+	// FIXME: only the case that Spec.NodeName specified is supported
+	nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName
+	if len(nodeName) == 0 {
+		return fmt.Errorf("empty node name")
+	}
+
+	if len(joint.Kind) == 0 {
+		joint.Kind = KindName
+	}
+	return c.sendToEdgeFunc(nodeName, eventType, joint)
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+
+	return nil
+}
diff --git a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
index ea1c8574..7a182119 100644
--- a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
+++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
@@ -29,6 +29,7 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -85,6 +86,8 @@ type Controller struct {
 	recorder record.EventRecorder
 
 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }
 
 // Run starts the main goroutine responsible for watching and syncing services.
@@ -557,14 +560,17 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Added, obj)
 		},
 
 		UpdateFunc: func(old, cur interface{}) {
 			jc.enqueueController(cur, true)
+			jc.syncToEdge(watch.Added, cur)
 		},
 
 		DeleteFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Deleted, obj)
 		},
 	})
 
diff --git a/pkg/globalmanager/controllers/lifelonglearning/downstream.go b/pkg/globalmanager/controllers/lifelonglearning/downstream.go
new file mode 100644
index 00000000..2f33516a
--- /dev/null
+++ b/pkg/globalmanager/controllers/lifelonglearning/downstream.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package lifelonglearning
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	job, ok := obj.(*sednav1.LifelongLearningJob)
+	if !ok {
+		return nil
+	}
+	// Here only propagate to the nodes with non empty name
+
+	// FIXME(llhuii): only the case that all workers have the same nodeName is supported;
+	// will support Spec.NodeSelector and different nodeNames.
+	nodeName := job.Spec.TrainSpec.Template.Spec.NodeName
+	if len(nodeName) == 0 {
+		return fmt.Errorf("empty node name")
+	}
+
+	runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)
+	return c.sendToEdgeFunc(nodeName, eventType, job)
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+	return nil
+}
diff --git a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
index baff37e1..d7d2dbef 100644
--- a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
+++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
@@ -28,6 +28,7 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -80,6 +81,8 @@ type Controller struct {
 	recorder record.EventRecorder
 
 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }
 
 // Run starts the main goroutine responsible for watching and syncing jobs.
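(For context: after this series every feature controller exposes the same small downstream surface. A minimal sketch of that surface, using only the interface and type names from pkg/globalmanager/runtime/types.go; the toy controller itself is hypothetical and not part of this series:

	// toyController shows the minimal downstream shape of a feature
	// controller; real controllers also watch informers and enqueue work.
	type toyController struct {
		sendToEdgeFunc runtime.DownstreamSendFunc
	}

	// Run blocks until stopCh is closed; a real controller starts its
	// sync workers here.
	func (c *toyController) Run(stopCh <-chan struct{}) { <-stopCh }

	// SetDownstreamSendFunc is called once by the manager, handing the
	// controller the message-layer send function before Run.
	func (c *toyController) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
		c.sendToEdgeFunc = f
		return nil
	}

The manager then only needs f.SetDownstreamSendFunc(sendFunc) followed by go f.Run(stopCh) for each registered controller, as in controllers/manager.go.)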
@@ -726,12 +729,15 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { jc.enqueueController(cur, true) + jc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Deleted, obj) }, }) jc.jobLister = jobInformer.Lister() diff --git a/pkg/globalmanager/controllers/manager.go b/pkg/globalmanager/controllers/manager.go index 1066597b..85328023 100644 --- a/pkg/globalmanager/controllers/manager.go +++ b/pkg/globalmanager/controllers/manager.go @@ -28,6 +28,7 @@ import ( clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" "github.com/kubeedge/sedna/pkg/globalmanager/config" + "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/globalmanager/utils" @@ -93,22 +94,24 @@ func (m *Manager) Start() error { } uc, _ := NewUpstreamController(context) - dc, _ := NewDownstreamController(context) context.UpstreamController = uc + downstreamSendFunc := messagelayer.NewContextMessageLayer().SendResourceObject + stopCh := make(chan struct{}) kubeInformerFactory.Start(stopCh) sednaInformerFactory.Start(stopCh) go uc.Run(stopCh) - go dc.Run(stopCh) for name, factory := range NewRegistry() { f, err := factory(context) if err != nil { return fmt.Errorf("failed to initialize controller %s: %v", name, err) } + f.SetDownstreamSendFunc(downstreamSendFunc) + go f.Run(stopCh) klog.Infof("started controller %s", name) } diff --git a/pkg/globalmanager/runtime/secret_injector.go b/pkg/globalmanager/runtime/secret_injector.go index 7649dfaa..4386a034 100644 --- a/pkg/globalmanager/runtime/secret_injector.go +++ b/pkg/globalmanager/runtime/secret_injector.go @@ -17,10 +17,13 @@ limitations under the License. 
 package runtime
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 
 	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
 )
 
 const (
@@ -106,10 +109,18 @@ func MergeSecretEnvs(nowE, newE []v1.EnvVar, overwrite bool) []v1.EnvVar {
 	return nowE
 }
 
-func InjectSecretObj(obj CommonInterface, secret *v1.Secret) {
-	if secret == nil {
+func InjectSecretAnnotations(client kubernetes.Interface, obj CommonInterface, secretName string) (err error) {
+	if len(secretName) == 0 {
+		return
+	}
+	secret, err := client.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{})
+	if err != nil {
 		return
 	}
+	return injectSecretObj(obj, secret)
+}
+
+func injectSecretObj(obj CommonInterface, secret *v1.Secret) (err error) {
 	secretData := secret.GetAnnotations()
 
@@ -127,4 +138,5 @@
 
 	ann[SecretAnnotationKey] = string(b)
 	obj.SetAnnotations(ann)
+	return nil
 }
diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go
index 5bbe13da..ebbcf61f 100644
--- a/pkg/globalmanager/runtime/types.go
+++ b/pkg/globalmanager/runtime/types.go
@@ -21,6 +21,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	k8sruntime "k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/watch"
 	kubeinformers "k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 
@@ -35,9 +36,15 @@ type CommonInterface interface {
 	k8sruntime.Object
 }
 
+// BaseControllerI defines the interface of a controller
+type BaseControllerI interface {
+	Run(stopCh <-chan struct{})
+}
+
 // FeatureControllerI defines the interface of an AI Feature controller
 type FeatureControllerI interface {
-	Run(stopCh <-chan struct{})
+	BaseControllerI
+	SetDownstreamSendFunc(f DownstreamSendFunc) error
 }
 
 type Model struct {
@@ -67,10 +74,12 @@ func (m *Model) GetURL() string {
 
 type UpstreamUpdateHandler func(namespace, name, operation string, content []byte) error
 
 type UpstreamControllerI interface {
-	FeatureControllerI
+	BaseControllerI
 	Add(kind string, updateHandler UpstreamUpdateHandler) error
 }
 
+type DownstreamSendFunc = func(nodeName string, eventType watch.EventType, obj interface{}) error
+
 type ControllerContext struct {
 	Config *config.ControllerConfig
 	UpstreamController UpstreamControllerI

From e962aab7bd91a63e17875d3111e53bdce5a2eb64 Mon Sep 17 00:00:00 2001
From: llhuii
Date: Mon, 26 Jul 2021 16:40:58 +0800
Subject: [PATCH 7/7] gm: more code clean after initial refactor done

1. remove the redundant feature name in all feature controllers (e.g.
'federatedlearningJob' to 'job'): each feature already has its own
independent package, so there is no need for the extra feature name
2. upstream interface optimization
3. fix the empty Kind of all CRs in downstream
4. add extra doc strings
5.
fix code style Signed-off-by: llhuii --- cmd/sedna-gm/sedna-gm.go | 4 + .../controllers/dataset/dataset.go | 2 - .../controllers/dataset/downstream.go | 13 +- .../controllers/dataset/upstream.go | 4 +- .../federatedlearning/downstream.go | 5 + .../federatedlearning/federatedlearningjob.go | 157 +++++++------- .../controllers/federatedlearning/upstream.go | 6 +- .../incrementallearning/downstream.go | 5 +- .../incrementallearningjob.go | 195 +++++++++--------- .../incrementallearning/upstream.go | 4 +- .../controllers/jointinference/downstream.go | 5 + .../jointinference/jointinferenceservice.go | 85 ++++---- .../controllers/jointinference/upstream.go | 4 +- .../lifelonglearning/downstream.go | 6 + .../lifelonglearning/lifelonglearningjob.go | 96 ++++----- .../controllers/lifelonglearning/upstream.go | 18 +- pkg/globalmanager/controllers/manager.go | 12 +- pkg/globalmanager/controllers/upstream.go | 10 +- pkg/globalmanager/runtime/common.go | 14 +- pkg/globalmanager/runtime/secret_injector.go | 1 - pkg/globalmanager/runtime/types.go | 77 ++++--- pkg/globalmanager/runtime/worker.go | 2 +- 22 files changed, 372 insertions(+), 353 deletions(-) diff --git a/cmd/sedna-gm/sedna-gm.go b/cmd/sedna-gm/sedna-gm.go index bce60eca..3777a617 100644 --- a/cmd/sedna-gm/sedna-gm.go +++ b/cmd/sedna-gm/sedna-gm.go @@ -17,7 +17,9 @@ limitations under the License. package main import ( + "math/rand" "os" + "time" "k8s.io/component-base/logs" @@ -25,6 +27,8 @@ import ( ) func main() { + rand.Seed(time.Now().UnixNano()) + command := app.NewControllerCommand() logs.InitLogs() defer logs.FlushLogs() diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go index 1de420c9..8523057c 100644 --- a/pkg/globalmanager/controllers/dataset/dataset.go +++ b/pkg/globalmanager/controllers/dataset/dataset.go @@ -70,7 +70,5 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { }, }) - c.addUpstreamHandler(cc) - return c, nil } diff --git a/pkg/globalmanager/controllers/dataset/downstream.go b/pkg/globalmanager/controllers/dataset/downstream.go index 8f9553fb..a898fac0 100644 --- a/pkg/globalmanager/controllers/dataset/downstream.go +++ b/pkg/globalmanager/controllers/dataset/downstream.go @@ -32,19 +32,17 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro return nil } + // Since t.Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + dataset.Kind = KindName + // Here only propagate to the nodes with non empty name nodeName := dataset.Spec.NodeName if len(nodeName) == 0 { return fmt.Errorf("empty node name") } - // Since t.Kind may be empty, - // we need to fix the kind here if missing. 
- // more details at https://github.com/kubernetes/kubernetes/issues/3030 - if len(dataset.Kind) == 0 { - dataset.Kind = KindName - } - runtime.InjectSecretAnnotations(c.kubeClient, dataset, dataset.Spec.CredentialName) return c.sendToEdgeFunc(nodeName, eventType, dataset) @@ -52,6 +50,5 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { c.sendToEdgeFunc = f - return nil } diff --git a/pkg/globalmanager/controllers/dataset/upstream.go b/pkg/globalmanager/controllers/dataset/upstream.go index 26a9feaa..a1b1949e 100644 --- a/pkg/globalmanager/controllers/dataset/upstream.go +++ b/pkg/globalmanager/controllers/dataset/upstream.go @@ -57,6 +57,6 @@ func (c *Controller) updateStatus(name, namespace string, status sednav1.Dataset }) } -func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { - return cc.UpstreamController.Add(KindName, c.updateFromEdge) +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) } diff --git a/pkg/globalmanager/controllers/federatedlearning/downstream.go b/pkg/globalmanager/controllers/federatedlearning/downstream.go index 9a50a8ec..3b5f2fd2 100644 --- a/pkg/globalmanager/controllers/federatedlearning/downstream.go +++ b/pkg/globalmanager/controllers/federatedlearning/downstream.go @@ -29,6 +29,11 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro return nil } + // Since Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + job.Kind = KindName + // broadcast to all nodes specified in spec nodeset := make(map[string]bool) for _, trainingWorker := range job.Spec.TrainingWorkers { diff --git a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index d3730a0f..b775b089 100644 --- a/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -54,14 +54,14 @@ const ( ) const ( - FLJobStageAgg = "Aggregation" - FLJobStageTrain = "Training" + jobStageAgg = "Aggregation" + jobStageTrain = "Training" ) // Kind contains the schema.GroupVersionKind for this controller type. var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) -// Controller ensures that all FLJob objects have corresponding pods to +// Controller ensures that all FederatedLearningJob objects have corresponding pods to // run their configured workload. type Controller struct { kubeClient kubernetes.Interface @@ -70,7 +70,7 @@ type Controller struct { // podStoreSynced returns true if the pod store has been synced at least once. // Added as a member to the struct to allow injection for testing. podStoreSynced cache.InformerSynced - // jobStoreSynced returns true if the flJob store has been synced at least once. + // jobStoreSynced returns true if the FederatedLearningJob store has been synced at least once. // Added as a member to the struct to allow injection for testing. 
jobStoreSynced cache.InformerSynced @@ -256,88 +256,93 @@ func (c *Controller) sync(key string) (bool, error) { sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { - klog.V(4).Infof("FLJob has been deleted: %v", key) + klog.V(4).Infof("%s %v has been deleted", Name, key) return true, nil } return false, err } - flJob := *sharedJob - // set kind for flJob in case that the kind is None - flJob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob")) - // if flJob was finished previously, we don't want to redo the termination - if IsFLJobFinished(&flJob) { + + job := *sharedJob + // set kind for FederatedLearningJob in case that the kind is None + job.SetGroupVersionKind(Kind) + + // if job was finished previously, we don't want to redo the termination + if IsJobFinished(&job) { return true, nil } - selector, _ := runtime.GenerateSelector(&flJob) - pods, err := c.podStore.Pods(flJob.Namespace).List(selector) + + selector, _ := runtime.GenerateSelector(&job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if err != nil { return false, err } activePods := k8scontroller.FilterActivePods(pods) active := int32(len(activePods)) - succeeded, failed := getStatus(pods) - conditions := len(flJob.Status.Conditions) - // flJob first start - if flJob.Status.StartTime == nil { + succeeded, failed := countPods(pods) + conditions := len(job.Status.Conditions) + + // set StartTime when job is handled firstly + if job.Status.StartTime == nil { now := metav1.Now() - flJob.Status.StartTime = &now + job.Status.StartTime = &now } var manageJobErr error jobFailed := false var failureReason string var failureMessage string - phase := flJob.Status.Phase + phase := job.Status.Phase if failed > 0 { jobFailed = true failureReason = "workerFailed" - failureMessage = "the worker of FLJob failed" + failureMessage = "the worker of FederatedLearningJob failed" } if jobFailed { - flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage)) - flJob.Status.Phase = sednav1.FLJobFailed - c.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage) + job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage)) + job.Status.Phase = sednav1.FLJobFailed + c.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage) } else { // in the First time, we create the pods if len(pods) == 0 { - active, manageJobErr = c.createPod(&flJob) + active, manageJobErr = c.createPod(&job) } complete := false if succeeded > 0 && active == 0 { complete = true } if complete { - flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondComplete, "", "")) + job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondComplete, "", "")) now := metav1.Now() - flJob.Status.CompletionTime = &now - c.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed") - flJob.Status.Phase = sednav1.FLJobSucceeded + job.Status.CompletionTime = &now + c.recorder.Event(&job, v1.EventTypeNormal, "Completed", "FederatedLearningJob completed") + job.Status.Phase = sednav1.FLJobSucceeded } else { - flJob.Status.Phase = sednav1.FLJobRunning + job.Status.Phase = sednav1.FLJobRunning } } forget := false // Check if the number of jobs succeeded increased since the last check. 
If yes "forget" should be true // This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to - // improve the FLJob backoff policy when parallelism > 1 and few FLJobs failed but others succeed. + // improve the job backoff policy when parallelism > 1 and few FLJobs failed but others succeed. // In this case, we should clear the backoff delay. - if flJob.Status.Succeeded < succeeded { + if job.Status.Succeeded < succeeded { forget = true } - // no need to update the flJob if the status hasn't changed since last time - if flJob.Status.Active != active || flJob.Status.Succeeded != succeeded || flJob.Status.Failed != failed || len(flJob.Status.Conditions) != conditions || flJob.Status.Phase != phase { - flJob.Status.Active = active - flJob.Status.Succeeded = succeeded - flJob.Status.Failed = failed + // no need to update the job if the status hasn't changed since last time + if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions || job.Status.Phase != phase { + job.Status.Active = active + job.Status.Succeeded = succeeded + job.Status.Failed = failed + c.updateJobStatus(&job) - if jobFailed && !IsFLJobFinished(&flJob) { - // returning an error will re-enqueue FLJob after the backoff period - return forget, fmt.Errorf("failed pod(s) detected for flJob key %q", key) + if jobFailed && !IsJobFinished(&job) { + // returning an error will re-enqueue FederatedLearningJob after the backoff period + return forget, fmt.Errorf("failed pod(s) detected for FederatedLearningJob key %q", key) } forget = true @@ -346,7 +351,7 @@ func (c *Controller) sync(key string) (bool, error) { return forget, manageJobErr } -func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition { +func NewJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition { return sednav1.FLJobCondition{ Type: conditionType, Status: v1.ConditionTrue, @@ -357,28 +362,24 @@ func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message } } -// getStatus returns no of succeeded and failed pods running a flJob -func getStatus(pods []*v1.Pod) (succeeded, failed int32) { +// countPods returns number of succeeded and failed pods +func countPods(pods []*v1.Pod) (succeeded, failed int32) { succeeded = int32(filterPods(pods, v1.PodSucceeded)) failed = int32(filterPods(pods, v1.PodFailed)) return } -func (c *Controller) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error { - jobClient := c.client.FederatedLearningJobs(flJob.Namespace) - var err error - for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 { - var newFLJob *sednav1.FederatedLearningJob - newFLJob, err = jobClient.Get(context.TODO(), flJob.Name, metav1.GetOptions{}) +func (c *Controller) updateJobStatus(job *sednav1.FederatedLearningJob) error { + jobClient := c.client.FederatedLearningJobs(job.Namespace) + return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { + newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { - break - } - newFLJob.Status = flJob.Status - if _, err = jobClient.UpdateStatus(context.TODO(), newFLJob, metav1.UpdateOptions{}); err == nil { - break + return err } - } - return nil + newJob.Status = job.Status + _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) + return err + }) } // filterPods returns pods based on their phase. 
@@ -392,7 +393,7 @@ func filterPods(pods []*v1.Pod, phase v1.PodPhase) int { return result } -func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool { +func IsJobFinished(j *sednav1.FederatedLearningJob) bool { for _, c := range j.Status.Conditions { if (c.Type == sednav1.FLJobCondComplete || c.Type == sednav1.FLJobCondFailed) && c.Status == v1.ConditionTrue { return true @@ -423,9 +424,9 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, // deliver pod for aggregation worker aggWorker := job.Spec.AggregationWorker - // Configure container mounting and Env information by initial runtime.WorkerParam + // Configure aggregation worker's mounts and envs var aggPort int32 = 7363 - aggWorkerParam := new(runtime.WorkerParam) + var aggWorkerParam runtime.WorkerParam aggWorkerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "WORKER_NAME": "aggworker-" + utilrand.String(5), @@ -435,7 +436,7 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, "PARTICIPANTS_COUNT": participantsCount, } - aggWorkerParam.WorkerType = FLJobStageAgg + aggWorkerParam.WorkerType = jobStageAgg aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure aggWorkerParam.Mounts = append(aggWorkerParam.Mounts, @@ -450,9 +451,9 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, ) // create aggpod based on configured parameters - _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, aggWorkerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, &aggWorkerParam) if err != nil { - return active, err + return active, fmt.Errorf("failed to create aggregation worker: %w", err) } active++ @@ -462,13 +463,17 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, // FIXME(llhuii): only the case that Spec.NodeName specified is support, // will support Spec.NodeSelector. 
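+	// the node IP resolved below is the address the aggregation service created afterwards is exposed on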
appIP, err = runtime.GetNodeIPByName(c.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName) + if err != nil { + return active, err + } - aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, FLJobStageAgg, aggPort, appIP) + aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, jobStageAgg, aggPort, appIP) if err != nil { return active, err } + // deliver pod for training worker - for _, trainingWorker := range job.Spec.TrainingWorkers { + for i, trainingWorker := range job.Spec.TrainingWorkers { // get dataseturl through parsing crd of dataset datasetName := trainingWorker.Dataset.Name dataset, err := c.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{}) @@ -483,9 +488,8 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, datasetSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } - // Configure container mounting and env information - workerParam := new(runtime.WorkerParam) - + // Configure training worker's mounts and envs + var workerParam runtime.WorkerParam workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{ URL: &runtime.MountURL{ @@ -519,10 +523,11 @@ func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, workerParam.WorkerType = runtime.TrainPodType workerParam.HostNetwork = true workerParam.RestartPolicy = v1.RestartPolicyOnFailure - // create train pod based on configured parameters - _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, workerParam) + + // create training worker based on configured parameters + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, &workerParam) if err != nil { - return active, err + return active, fmt.Errorf("failed to create %dth training worker: %w", i, err) } active++ } @@ -545,25 +550,35 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { kubeClient: cc.KubeClient, client: cc.SednaClient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "flJob"), - recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), + recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: Name + "-controller"}), cfg: cfg, } jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { fc.enqueueController(obj, true) + + // when a federated learning job is added, + // send it to edge's LC. fc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { fc.enqueueController(cur, true) + + // when a federated learning job is updated, + // send it to edge's LC as Added event. fc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { fc.enqueueController(obj, true) + + // when a federated learning job is deleted, + // send it to edge's LC. 
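+			// so that the LC can clean up its local job state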
fc.syncToEdge(watch.Deleted, obj) }, }) + fc.jobLister = jobInformer.Lister() fc.jobStoreSynced = jobInformer.Informer().HasSynced @@ -575,7 +590,5 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { fc.podStore = podInformer.Lister() fc.podStoreSynced = podInformer.Informer().HasSynced - fc.addUpstreamHandler(cc) - return fc, nil } diff --git a/pkg/globalmanager/controllers/federatedlearning/upstream.go b/pkg/globalmanager/controllers/federatedlearning/upstream.go index 0bcba81e..01888a6d 100644 --- a/pkg/globalmanager/controllers/federatedlearning/upstream.go +++ b/pkg/globalmanager/controllers/federatedlearning/upstream.go @@ -110,7 +110,7 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ // TODO: more meaningful reason/message reason := "DoTraining" message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) - cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) + cond := NewJobCondition(sednav1.FLJobCondTraining, reason, message) c.appendStatusCondition(name, namespace, cond) } } @@ -118,6 +118,6 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [ return nil } -func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error { - return cc.UpstreamController.Add(KindName, c.updateFromEdge) +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) } diff --git a/pkg/globalmanager/controllers/incrementallearning/downstream.go b/pkg/globalmanager/controllers/incrementallearning/downstream.go index cba0b136..a53da8cb 100644 --- a/pkg/globalmanager/controllers/incrementallearning/downstream.go +++ b/pkg/globalmanager/controllers/incrementallearning/downstream.go @@ -55,6 +55,10 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro if !ok { return nil } + + // Since Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 job.Kind = KindName jobConditions := job.Status.Conditions @@ -133,7 +137,6 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro } return nil - } func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { diff --git a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index 5e88232f..f1d792ae 100644 --- a/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -32,7 +32,6 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -68,7 +67,7 @@ type Controller struct { // podStoreSynced returns true if the pod store has been synced at least once. // Added as a member to the struct to allow injection for testing. podStoreSynced cache.InformerSynced - // jobStoreSynced returns true if the incrementaljob store has been synced at least once. + // jobStoreSynced returns true if the job store has been synced at least once. // Added as a member to the struct to allow injection for testing. 
 	jobStoreSynced cache.InformerSynced
@@ -81,8 +80,6 @@ type Controller struct {
 	// IncrementalLearningJobs that need to be updated
 	queue workqueue.RateLimitingInterface
 
-	recorder record.EventRecorder
-
 	cfg *config.ControllerConfig
 
 	sendToEdgeFunc runtime.DownstreamSendFunc
@@ -104,6 +101,7 @@ func (c *Controller) Run(stopCh <-chan struct{}) {
 		return
 	}
 
+	klog.Infof("Starting %s job workers", Name)
 	for i := 0; i < workers; i++ {
 		go wait.Until(c.worker, time.Second, stopCh)
 
@@ -252,7 +250,8 @@ func (c *Controller) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key)
 	}
-	sharedIncrementalJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name)
+
+	sharedJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("incrementallearning job has been deleted: %v", key)
@@ -260,19 +259,21 @@
 		}
 		return false, err
 	}
-	incrementaljob := *sharedIncrementalJob
-	// set kind for incrementaljob in case that the kind is None
-	incrementaljob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob"))
-	// incrementaljob first start, create pod for inference
-	if incrementaljob.Status.StartTime == nil {
+
+	job := *sharedJob
+	// set kind in case that the kind is None
+	job.SetGroupVersionKind(Kind)
+
+	// when the job is first handled, create the pod for inference
+	if job.Status.StartTime == nil {
 		now := metav1.Now()
-		incrementaljob.Status.StartTime = &now
-		pod := c.getSpecifiedPods(&incrementaljob, runtime.InferencePodType)
+		job.Status.StartTime = &now
+		pod := c.getSpecifiedPods(&job, runtime.InferencePodType)
 		if pod == nil {
-			err = c.createInferPod(&incrementaljob)
+			err = c.createInferPod(&job)
 		} else {
 			if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending {
-				err = c.createInferPod(&incrementaljob)
+				err = c.createInferPod(&job)
 			}
 		}
 		if err != nil {
@@ -280,8 +281,8 @@
 		}
 	}
 
-	// if incrementaljob was finished previously, we don't want to redo the termination
-	if IsIncrementalJobFinished(&incrementaljob) {
+	// if job was finished previously, we don't want to redo the termination
+	if IsJobFinished(&job) {
 		return true, nil
 	}
 
@@ -289,20 +290,20 @@
 	jobFailed := false
 	needUpdated := false
 
-	// update conditions of incremental job
-	needUpdated, err = c.updateIncrementalJobConditions(&incrementaljob)
+	// transit this job's state machine
+	needUpdated, err = c.transitJobState(&job)
 	if err != nil {
-		klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
+		klog.V(2).Infof("incrementallearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
 	}
 
 	if needUpdated {
-		if err := c.updateIncrementalJobStatus(&incrementaljob); err != nil {
+		if err := c.updateJobStatus(&job); err != nil {
 			return forget, err
 		}
 
-		if jobFailed && !IsIncrementalJobFinished(&incrementaljob) {
-			// returning an error will re-enqueue IncrementalJob after the backoff period
-			return forget, fmt.Errorf("failed pod(s) detected for incrementaljob key %q", key)
+		if jobFailed && !IsJobFinished(&job) {
+			// returning an error will re-enqueue IncrementalLearningJob after the backoff period
+			return forget, fmt.Errorf("failed pod(s) detected for incrementallearning job key %q", key)
 		}
 
 		forget = true
@@ -317,61 +318,56 @@ func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob,
 	key := runtime.AnnotationsKeyPrefix + jobStage
 
 	ann := job.GetAnnotations()
-	if ann != nil {
-		if ann[key] == nodeName {
-			// already set
-			return nil
-		}
+	if ann[key] == nodeName {
+		// already set
+		return nil
 	}
+	dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
 	jobClient := c.client.IncrementalLearningJobs(job.Namespace)
-	var err error
-	for i := 0; i <= runtime.ResourceUpdateRetries; i++ {
-		var newJob *sednav1.IncrementalLearningJob
-		newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
+			return err
 		}
 
 		annotations := newJob.GetAnnotations()
-		if annotations != nil {
-			if annotations[key] == nodeName {
-				return nil
-			}
-		}
-
-		dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
-		if _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}); err == nil {
-			break
+		if annotations[key] == nodeName {
+			return nil
 		}
-	}
-	return err
+		_, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{})
+		return err
+	})
 }
 
-// updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus
-func (c *Controller) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) {
+// transitJobState transits the job to its next state
+func (c *Controller) transitJobState(job *sednav1.IncrementalLearningJob) (bool, error) {
 	var initialType sednav1.ILJobStageConditionType
 	var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{
 		Stage: sednav1.ILJobTrain,
 		Type:  initialType,
 	}
+
 	var newConditionType sednav1.ILJobStageConditionType
 	var needUpdated = false
-	jobConditions := incrementaljob.Status.Conditions
+
 	var podStatus v1.PodPhase = v1.PodUnknown
 	var pod *v1.Pod
+
+	jobConditions := job.Status.Conditions
 	if len(jobConditions) > 0 {
 		// get latest pod and pod status
 		latestCondition = (jobConditions)[len(jobConditions)-1]
-		klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name,
+		klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", job.Namespace, job.Name,
 			latestCondition.Stage)
-		pod = c.getSpecifiedPods(incrementaljob, string(latestCondition.Stage))
+		pod = c.getSpecifiedPods(job, string(latestCondition.Stage))
 		if pod != nil {
 			podStatus = pod.Status.Phase
 		}
 	}
+
 	jobStage := latestCondition.Stage
 	currentType := latestCondition.Type
 	newConditionType = currentType
@@ -388,14 +384,14 @@
 			// include train, eval, deploy pod
 			var err error
 			if jobStage == sednav1.ILJobDeploy {
-				err = c.restartInferPod(incrementaljob)
+				err = c.restartInferPod(job)
 				if err != nil {
-					klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
+					klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
 				} else {
-					klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name)
+					klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
 				}
 			} else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
-				err = c.createPod(incrementaljob, jobStage)
+				err = c.createPod(job, jobStage)
 			}
 			if err != nil {
 				return needUpdated, err
@@ -411,17 +407,17 @@
 				newConditionType = sednav1.ILJobStageCondRunning
 
 				// add nodeName to job
-				if err := c.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil {
+				if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil {
 					return needUpdated, err
 				}
 			}
 		} else if podStatus == v1.PodSucceeded {
 			// watch pod status, if pod completed, set type completed
 			newConditionType = sednav1.ILJobStageCondCompleted
-			klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
+			klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
 		} else if podStatus == v1.PodFailed {
 			newConditionType = sednav1.ILJobStageCondFailed
-			klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
+			klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
 		}
 	case sednav1.ILJobStageCondCompleted:
 		jobStage = getNextStage(jobStage)
@@ -434,31 +430,29 @@
 	default:
 		// do nothing when given other type out of cases
 	}
-	klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", incrementaljob.Namespace, incrementaljob.Name, jobConditions)
+
+	klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
 	if latestCondition.Type != newConditionType {
-		incrementaljob.Status.Conditions = append(incrementaljob.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
+		job.Status.Conditions = append(job.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
 		needUpdated = true
-		return needUpdated, nil
 	}
+
 	return needUpdated, nil
 }
 
-// updateIncrementalJobStatus ensures that jobstatus can be updated rightly
-func (c *Controller) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error {
-	jobClient := c.client.IncrementalLearningJobs(incrementaljob.Namespace)
-	var err error
-	for i := 0; i <= runtime.ResourceUpdateRetries; i++ {
-		var newIncrementalJob *sednav1.IncrementalLearningJob
-		newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{})
+// updateJobStatus ensures that the job status can be updated correctly
+func (c *Controller) updateJobStatus(job *sednav1.IncrementalLearningJob) error {
+	jobClient := c.client.IncrementalLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
-		}
-		newIncrementalJob.Status = incrementaljob.Status
-		if _, err = jobClient.UpdateStatus(context.TODO(), newIncrementalJob, metav1.UpdateOptions{}); err == nil {
-			break
+			return err
 		}
-	}
-	return err
+
+		newJob.Status = job.Status
+		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
+		return err
+	})
 }
 func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition {
@@ -478,21 +472,24 @@ func (c *Controller) generatePodName(jobName string, workerType string) string {
 }
 
 func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod {
-	if podType == "Deploy" {
-		podType = runtime.InferencePodType
-	}
 	var latestPod *v1.Pod
 	selector, _ := runtime.GenerateSelector(job)
 	pods, err := c.podStore.Pods(job.Namespace).List(selector)
 	if len(pods) == 0 || err != nil {
 		return nil
 	}
+
 	var matchTag = false
 	latestPod = pods[0]
+
+	if podType == "Deploy" {
+		podType = runtime.InferencePodType
+	}
+
 	for _, pod := range pods {
 		s := strings.Split(pod.Name, "-")
-		CurrentPodType := s[len(s)-2]
-		if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) {
+		currentPodType := s[len(s)-2]
+		if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && currentPodType == strings.ToLower(podType) {
 			latestPod = pod
 			matchTag = true
 		}
@@ -510,12 +507,14 @@ func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error
 		err := c.createInferPod(job)
 		return err
 	}
+
 	ctx := context.Background()
 	err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
 	if err != nil {
 		klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
 		return err
 	}
+
 	err = c.createInferPod(job)
 	if err != nil {
 		klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
@@ -537,7 +536,7 @@ func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage {
 	}
 }
 
-func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool {
+func IsJobFinished(j *sednav1.IncrementalLearningJob) bool {
 	// TODO
 	return false
 }
@@ -600,13 +599,14 @@ func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sedn
 	}
 
 	// get all url for train and eval from data in condition
+	var cond IncrementalCondData
 	condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
 	klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
-	var cond IncrementalCondData
 	(&cond).Unmarshal([]byte(condDataStr))
 	if cond.Input == nil {
 		return fmt.Errorf("empty input from condData")
 	}
+
 	dataURL := cond.Input.DataURL
 	inputmodelURLs := cond.GetInputModelURLs()
 
@@ -619,13 +619,14 @@ func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sedn
 		originalDataURLOrIndex = dataset.Spec.URL
 	}
 
-	var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
+	var workerParam runtime.WorkerParam
+
 	if podtype == sednav1.ILJobTrain {
 		workerParam.WorkerType = runtime.TrainPodType
 
 		podTemplate = &job.Spec.TrainSpec.Template
 
-		// Env parameters for train 
+		// Env parameters for train
 		workerParam.Env = map[string]string{
 			"NAMESPACE": job.Namespace,
 			"JOB_NAME":  job.Name,
@@ -688,10 +689,10 @@
 			},
 		)
 	} else {
+		// Configure eval worker's mounts and envs
 		podTemplate = &job.Spec.EvalSpec.Template
 		workerParam.WorkerType = "Eval"
 
-		// Configure Env information for eval by initial runtime.WorkerParam
 		workerParam.Env = map[string]string{
 			"NAMESPACE": job.Namespace,
 			"JOB_NAME":  job.Name,
@@ -757,10 +758,7 @@ func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sedn
 	workerParam.HostNetwork = true
 
 	// create pod based on podtype
-	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam)
-	if err != nil {
-		return err
-	}
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, &workerParam)
 	return
 }
 
@@ -771,19 +769,20 @@ func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error {
 		return fmt.Errorf("failed to get infer model %s: %w", infermodelName, err)
 	}
+
 	inferModelURL := inferModel.Spec.URL
 
-	// Env parameters for edge
 	HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters)
 	HEMParameterString := string(HEMParameterJSON)
 
-	// Configure container mounting and Env information by initial runtime.WorkerParam
 	modelSecret, err := c.getSecret(
 		job.Namespace,
 		inferModel.Spec.CredentialName,
 		fmt.Sprintf("model %s", inferModel.Name),
 	)
-	var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
+
+	// Configure inference worker's mounts and envs
+	var workerParam runtime.WorkerParam
 	workerParam.Mounts = append(workerParam.Mounts,
 		runtime.WorkerMount{
 			URL: &runtime.MountURL{
@@ -810,13 +809,13 @@ func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error {
 	workerParam.WorkerType = runtime.InferencePodType
 	workerParam.HostNetwork = true
 
-	// create edge pod
-	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
+	// create the inference worker
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, &workerParam)
 	return err
 }
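The worker construction in both createPod and createInferPod now declares runtime.WorkerParam as a value and passes its address, instead of the old new(runtime.WorkerParam) pointer allocation. A minimal sketch of the calling pattern, assuming only the fields visible in the hunks above (WorkerType, Env, HostNetwork) — everything else is trimmed:

    // build the worker parameters on the stack ...
    var workerParam runtime.WorkerParam
    workerParam.WorkerType = runtime.TrainPodType
    workerParam.Env = map[string]string{
        "NAMESPACE": job.Namespace,
        "JOB_NAME":  job.Name,
    }
    workerParam.HostNetwork = true

    // ... and hand the helpers its address; CreatePodWithTemplate keeps
    // its *WorkerParam signature, so callers simply pass &workerParam
    if _, err := runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.TrainSpec.Template, &workerParam); err != nil {
        return err
    }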
 func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	podInformer := cc.KubeInformerFactory.Core().V1().Pods()
@@ -829,9 +828,9 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 		kubeClient: cc.KubeClient,
 		client:     cc.SednaClient.SednaV1alpha1(),
 
-		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "incrementallearningjob"),
-		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}),
-		cfg:      cc.Config,
+		queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
+
+		cfg: cc.Config,
 	}
 
 	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
@@ -859,7 +858,5 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	jc.podStore = podInformer.Lister()
 	jc.podStoreSynced = podInformer.Informer().HasSynced
 
-	jc.addUpstreamHandler(cc)
-
 	return jc, nil
 }
diff --git a/pkg/globalmanager/controllers/incrementallearning/upstream.go b/pkg/globalmanager/controllers/incrementallearning/upstream.go
index fa3975a4..7932a003 100644
--- a/pkg/globalmanager/controllers/incrementallearning/upstream.go
+++ b/pkg/globalmanager/controllers/incrementallearning/upstream.go
@@ -157,6 +157,6 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [
 	return nil
 }
 
-func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error {
-	return cc.UpstreamController.Add(KindName, c.updateFromEdge)
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
 }
diff --git a/pkg/globalmanager/controllers/jointinference/downstream.go b/pkg/globalmanager/controllers/jointinference/downstream.go
index 31778ef6..99b2563d 100644
--- a/pkg/globalmanager/controllers/jointinference/downstream.go
+++ b/pkg/globalmanager/controllers/jointinference/downstream.go
@@ -31,6 +31,11 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro
 		return nil
 	}
 
+	// Since Kind may be empty,
+	// we need to fix the kind here if missing.
+	// more details at https://github.com/kubernetes/kubernetes/issues/3030
+	joint.Kind = KindName
+
 	// Here only propagate to the nodes with non empty name
 	// FIXME: only the case that Spec.NodeName specified is support
 	nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName
diff --git a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
index 7a182119..faff1143 100644
--- a/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
+++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go
@@ -75,7 +75,7 @@ type Controller struct {
 	// A store of pods
 	podStore corelisters.PodLister
 
-	// serviceStoreSynced returns true if the jointinferenceservice store has been synced at least once.
+	// serviceStoreSynced returns true if the JointInferenceService store has been synced at least once.
 	serviceStoreSynced cache.InformerSynced
 	// A store of service
 	serviceLister sednav1listers.JointInferenceServiceLister
@@ -114,7 +114,7 @@ func (c *Controller) Run(stopCh <-chan struct{}) {
 	<-stopCh
 }
 
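With SetUpstreamHandler, the dependency between a feature controller and the upstream controller is inverted: the controller no longer reaches into the shared context for cc.UpstreamController, it just receives a registration callback. One practical consequence, sketched below with invented names (handlers, fakeAdd), is that a controller can be wired to a fake in tests without a live UpstreamController:

    handlers := map[string]runtime.UpstreamHandler{}
    fakeAdd := func(kind string, h runtime.UpstreamHandler) error {
        handlers[strings.ToLower(kind)] = h // mirror UpstreamController.Add
        return nil
    }
    if err := c.SetUpstreamHandler(fakeAdd); err != nil {
        t.Fatal(err)
    }
    // the captured handler can now be fed fake edge updates directly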
-// enqueueByPod enqueues the jointInferenceService object of the specified pod.
+// enqueueByPod enqueues the JointInferenceService object of the specified pod.
 func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)
 
@@ -167,7 +167,7 @@ func (c *Controller) updatePod(old, cur interface{}) {
 	c.addPod(curPod)
 }
 
-// deletePod enqueues the jointinferenceservice obj When a pod is deleted
+// deletePod enqueues the JointInferenceService obj when a pod is deleted
 func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)
 
@@ -176,7 +176,7 @@ func (c *Controller) deletePod(obj interface{}) {
 	// When a delete is dropped, the relist will notice a pod in the store not
 	// in the list, leading to the insertion of a tombstone object which contains
 	// the deleted key/value. Note that this value might be stale. If the pod
-	// changed labels the new jointinferenceservice will not be woken up till the periodic resync.
+	// changed labels the new JointInferenceService will not be woken up till the periodic resync.
 	if !ok {
 		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
 		if !ok {
@@ -252,7 +252,7 @@ func (c *Controller) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid jointinference service key %q: either namespace or name is missing", key)
 	}
-	sharedJointinferenceservice, err := c.serviceLister.JointInferenceServices(ns).Get(name)
+	sharedService, err := c.serviceLister.JointInferenceServices(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("JointInferenceService has been deleted: %v", key)
@@ -261,37 +261,38 @@
 		}
 		return false, err
 	}
 
-	jointinferenceservice := *sharedJointinferenceservice
+	service := *sharedService
 
-	// if jointinferenceservice was finished previously, we don't want to redo the termination
-	if isJointinferenceserviceFinished(&jointinferenceservice) {
+	// if service was finished previously, we don't want to redo the termination
+	if isServiceFinished(&service) {
 		return true, nil
 	}
 
-	// set kind for jointinferenceservice in case that the kind is None
+	// set kind for service in case that the kind is None
 	// more details at https://github.com/kubernetes/kubernetes/issues/3030
-	jointinferenceservice.SetGroupVersionKind(Kind)
+	service.SetGroupVersionKind(Kind)
 
-	selector, _ := runtime.GenerateSelector(&jointinferenceservice)
-	pods, err := c.podStore.Pods(jointinferenceservice.Namespace).List(selector)
+	selector, _ := runtime.GenerateSelector(&service)
+	pods, err := c.podStore.Pods(service.Namespace).List(selector)
 	if err != nil {
 		return false, err
 	}
 
-	klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", jointinferenceservice.Namespace, jointinferenceservice.Name, len(pods), pods)
+	klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", service.Namespace, service.Name, len(pods), pods)
 
-	latestConditionLen := len(jointinferenceservice.Status.Conditions)
+	latestConditionLen := len(service.Status.Conditions)
 
 	active := runtime.CalcActivePodCount(pods)
 	var failed int32 = 0
+
 	// neededCounts means that two pods should be created successfully in a jointinference service currently
 	// two pods consist of edge pod and cloud pod
 	var neededCounts int32 = 2
-	// jointinferenceservice first start
-	if jointinferenceservice.Status.StartTime == nil {
+
+	if service.Status.StartTime == nil {
 		now := metav1.Now()
-		jointinferenceservice.Status.StartTime = &now
+		service.Status.StartTime = &now
 	} else {
 		failed = neededCounts - active
 	}
@@ -303,7 +304,7 @@
 	// get the latest condition type
 	// based on that condition updated is appended, not inserted.
-	jobConditions := jointinferenceservice.Status.Conditions
+	jobConditions := service.Status.Conditions
 	if len(jobConditions) > 0 {
 		latestConditionType = (jobConditions)[len(jobConditions)-1].Type
 	}
@@ -316,12 +317,12 @@
 		serviceFailed = true
 		// TODO: get the failed worker, and knows that which worker fails, edge inference worker or cloud inference worker
 		reason = "workerFailed"
-		message = "the worker of Jointinferenceservice failed"
+		message = "the worker of service failed"
 		newCondtionType = sednav1.JointInferenceServiceCondFailed
-		c.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message)
+		c.recorder.Event(&service, v1.EventTypeWarning, reason, message)
 	} else {
 		if len(pods) == 0 {
-			active, manageServiceErr = c.createWorkers(&jointinferenceservice)
+			active, manageServiceErr = c.createWorkers(&service)
 		}
 		if manageServiceErr != nil {
 			serviceFailed = true
@@ -336,20 +337,20 @@
 	//
 	if newCondtionType != latestConditionType {
-		jointinferenceservice.Status.Conditions = append(jointinferenceservice.Status.Conditions, NewJointInferenceServiceCondition(newCondtionType, reason, message))
+		service.Status.Conditions = append(service.Status.Conditions, newServiceCondition(newCondtionType, reason, message))
 	}
 
 	forget := false
 
 	// no need to update the jointinferenceservice if the status hasn't changed since last time
-	if jointinferenceservice.Status.Active != active || jointinferenceservice.Status.Failed != failed || len(jointinferenceservice.Status.Conditions) != latestConditionLen {
-		jointinferenceservice.Status.Active = active
-		jointinferenceservice.Status.Failed = failed
+	if service.Status.Active != active || service.Status.Failed != failed || len(service.Status.Conditions) != latestConditionLen {
+		service.Status.Active = active
+		service.Status.Failed = failed
 
-		if err := c.updateStatus(&jointinferenceservice); err != nil {
+		if err := c.updateStatus(&service); err != nil {
 			return forget, err
 		}
 
-		if serviceFailed && !isJointinferenceserviceFinished(&jointinferenceservice) {
+		if serviceFailed && !isServiceFinished(&service) {
 			// returning an error will re-enqueue jointinferenceservice after the backoff period
 			return forget, fmt.Errorf("failed pod(s) detected for jointinference service key %q", key)
 		}
@@ -360,8 +361,8 @@
 	return forget, manageServiceErr
 }
 
-// NewJointInferenceServiceCondition creates a new joint condition
-func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
+// newServiceCondition creates a new service condition
+func newServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
 	return sednav1.JointInferenceServiceCondition{
 		Type:   conditionType,
 		Status: v1.ConditionTrue,
@@ -372,24 +373,20 @@
 	}
 }
 
-func (c *Controller) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error {
-	serviceClient := c.client.JointInferenceServices(jointinferenceservice.Namespace)
-	var err error
-	for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 {
-		var newJointinferenceservice *sednav1.JointInferenceService
-		newJointinferenceservice, err = serviceClient.Get(context.TODO(), jointinferenceservice.Name, metav1.GetOptions{})
+func (c *Controller) updateStatus(service *sednav1.JointInferenceService) error {
+	client := c.client.JointInferenceServices(service.Namespace)
+	return runtime.RetryUpdateStatus(service.Name, service.Namespace, func() error {
+		newService, err := client.Get(context.TODO(), service.Name, metav1.GetOptions{})
 		if err != nil {
-			break
-		}
-		newJointinferenceservice.Status = jointinferenceservice.Status
-		if _, err = serviceClient.UpdateStatus(context.TODO(), newJointinferenceservice, metav1.UpdateOptions{}); err == nil {
-			break
+			return err
 		}
-	}
-	return nil
+		newService.Status = service.Status
+		_, err = client.UpdateStatus(context.TODO(), newService, metav1.UpdateOptions{})
+		return err
+	})
 }
 
-func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
+func isServiceFinished(j *sednav1.JointInferenceService) bool {
 	for _, c := range j.Status.Conditions {
 		if (c.Type == sednav1.JointInferenceServiceCondFailed) && c.Status == v1.ConditionTrue {
 			return true
@@ -586,7 +583,5 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	jc.podStore = podInformer.Lister()
 	jc.podStoreSynced = podInformer.Informer().HasSynced
 
-	jc.addUpstreamHandler(cc)
-
 	return jc, nil
 }
diff --git a/pkg/globalmanager/controllers/jointinference/upstream.go b/pkg/globalmanager/controllers/jointinference/upstream.go
index ceff6e77..93d0fa7e 100644
--- a/pkg/globalmanager/controllers/jointinference/upstream.go
+++ b/pkg/globalmanager/controllers/jointinference/upstream.go
@@ -87,6 +87,6 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [
 	return nil
 }
 
-func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error {
-	return cc.UpstreamController.Add(KindName, c.updateFromEdge)
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
 }
diff --git a/pkg/globalmanager/controllers/lifelonglearning/downstream.go b/pkg/globalmanager/controllers/lifelonglearning/downstream.go
index 2f33516a..8b9ef5fa 100644
--- a/pkg/globalmanager/controllers/lifelonglearning/downstream.go
+++ b/pkg/globalmanager/controllers/lifelonglearning/downstream.go
@@ -30,6 +30,12 @@ func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) erro
 	if !ok {
 		return nil
 	}
+
+	// Since Kind may be empty,
+	// we need to fix the kind here if missing.
+	// more details at https://github.com/kubernetes/kubernetes/issues/3030
+	job.Kind = KindName
+
 	// Here only propagate to the nodes with non empty name
 	// FIXME(llhuii): only the case that all workers having the same nodeName are support,
diff --git a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
index d7d2dbef..a946ca7c 100644
--- a/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
+++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go
@@ -30,7 +30,6 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/cache"
@@ -78,8 +77,6 @@ type Controller struct {
 	// LifelongLearningJobs that need to be updated
 	queue workqueue.RateLimitingInterface
 
-	recorder record.EventRecorder
-
 	cfg *config.ControllerConfig
 
 	sendToEdgeFunc runtime.DownstreamSendFunc
@@ -248,7 +245,7 @@ func (c *Controller) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key)
 	}
-	sharedLifelongLearningJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name)
+	sharedJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("lifelonglearning job has been deleted: %v", key)
@@ -256,18 +253,18 @@
 		}
 		return false, err
 	}
-	lifelonglearningjob := *sharedLifelongLearningJob
+	job := *sharedJob
 	// set kind for lifelonglearningjob in case that the kind is None
-	lifelonglearningjob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob"))
+	job.SetGroupVersionKind(Kind)
 
-	// lifelonglearningjob first start
-	if lifelonglearningjob.Status.StartTime == nil {
+	if job.Status.StartTime == nil {
+		// the job is handled for the first time
 		now := metav1.Now()
-		lifelonglearningjob.Status.StartTime = &now
+		job.Status.StartTime = &now
 	}
 
-	// if lifelonglearningjob was finished previously, we don't want to redo the termination
-	if IsLifelongLearningJobFinished(&lifelonglearningjob) {
+	// if job was finished previously, we don't want to redo the termination
+	if IsJobFinished(&job) {
 		return true, nil
 	}
 
@@ -275,18 +272,18 @@
 	jobFailed := false
 	needUpdated := false
 
-	// update conditions of lifelonglearning job
-	needUpdated, err = c.updateLifelongLearningJobConditions(&lifelonglearningjob)
+	// transit this job's state machine
+	needUpdated, err = c.transitJobState(&job)
 	if err != nil {
-		klog.V(2).Infof("lifelonglearning job %v/%v faied to be updated, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
+		klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
 	}
 
 	if needUpdated {
-		if err := c.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil {
+		if err := c.updateJobStatus(&job); err != nil {
 			return forget, err
 		}
 
-		if jobFailed && !IsLifelongLearningJobFinished(&lifelonglearningjob) {
+		if jobFailed && !IsJobFinished(&job) {
 			// returning an error will re-enqueue LifelongLearningJob after the backoff period
 			return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key)
 		}
@@ -297,24 +294,25 @@ func (c *Controller) sync(key string) (bool, error) {
 	return forget, err
 }
 
-// updateLifelongLearningJobConditions ensures that conditions of lifelonglearning job can be changed by podstatus
-func (c *Controller) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) {
+// transitJobState transits the job to its next state
+func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) {
 	var initialType sednav1.LLJobStageConditionType
 	var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{
 		Stage: sednav1.LLJobTrain,
 		Type:  initialType,
 	}
+
 	var newConditionType sednav1.LLJobStageConditionType
-	latestCondition.Stage = sednav1.LLJobTrain
 	var needUpdated = false
-	jobConditions := lifelonglearningjob.Status.Conditions
+	var podStatus v1.PodPhase = v1.PodUnknown
+	jobConditions := job.Status.Conditions
 	if len(jobConditions) > 0 {
 		// get latest pod and pod status
 		latestCondition = (jobConditions)[len(jobConditions)-1]
-		klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", lifelonglearningjob.Namespace, lifelonglearningjob.Name,
+		klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name,
 			latestCondition.Stage)
-		pod := c.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage))
+		pod := c.getSpecifiedPods(job, string(latestCondition.Stage))
 
 		if pod != nil {
 			podStatus = pod.Status.Phase
@@ -336,14 +334,14 @@
 			// include train, eval, deploy pod
 			var err error
 			if jobStage == sednav1.LLJobDeploy {
-				err = c.restartInferPod(lifelonglearningjob)
+				err = c.restartInferPod(job)
 				if err != nil {
-					klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
+					klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
 				} else {
-					klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", lifelonglearningjob.Namespace, lifelonglearningjob.Name)
+					klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
 				}
 			} else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
-				err = c.createPod(lifelonglearningjob, jobStage)
+				err = c.createPod(job, jobStage)
 			}
 			if err != nil {
 				return needUpdated, err
@@ -361,10 +359,10 @@
 		} else if podStatus == v1.PodSucceeded {
 			// watch pod status, if pod completed, set type completed
 			newConditionType = sednav1.LLJobStageCondCompleted
-			klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage)
+			klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
 		} else if podStatus == v1.PodFailed {
 			newConditionType = sednav1.LLJobStageCondFailed
-			klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage)
+			klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
 		}
 	case sednav1.LLJobStageCondCompleted:
 		jobStage = c.getNextStage(jobStage)
@@ -377,34 +375,31 @@
 	default:
 		// do nothing when given other type out of cases
 	}
-	klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobConditions)
+
+	klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
 	if latestCondition.Type != newConditionType {
-		lifelonglearningjob.Status.Conditions = append(lifelonglearningjob.Status.Conditions, NewLifelongLearningJobCondition(newConditionType, jobStage))
+		job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(newConditionType, jobStage))
 		needUpdated = true
 		return needUpdated, nil
 	}
 	return needUpdated, nil
 }
 
-// updateLifelongLearningJobStatus ensures that jobstatus can be updated rightly
-func (c *Controller) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error {
-	jobClient := c.client.LifelongLearningJobs(lifelonglearningjob.Namespace)
-	var err error
-	for i := 0; i <= runtime.ResourceUpdateRetries; i = i + 1 {
-		var newLifelongLearningJob *sednav1.LifelongLearningJob
-		newLifelongLearningJob, err = jobClient.Get(context.TODO(), lifelonglearningjob.Name, metav1.GetOptions{})
+// updateJobStatus ensures that the job status can be updated correctly
+func (c *Controller) updateJobStatus(job *sednav1.LifelongLearningJob) error {
+	jobClient := c.client.LifelongLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
-		}
-		newLifelongLearningJob.Status = lifelonglearningjob.Status
-		if _, err = jobClient.UpdateStatus(context.TODO(), newLifelongLearningJob, metav1.UpdateOptions{}); err == nil {
-			break
+			return err
 		}
-	}
-	return err
+		newJob.Status = job.Status
+		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
+		return err
+	})
 }
 
-func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition {
+func NewJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition {
 	return sednav1.LLJobCondition{
 		Type:   conditionType,
 		Status: v1.ConditionTrue,
@@ -492,7 +487,7 @@ func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret
 	return
 }
 
-func IsLifelongLearningJobFinished(j *sednav1.LifelongLearningJob) bool {
+func IsJobFinished(j *sednav1.LifelongLearningJob) bool {
 	// TODO
 	return false
 }
@@ -529,7 +524,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
 	// get all url for train and eval from data in condition
 	condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
 	klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
-	var cond LifelongLearningCondData
+	var cond ConditionData
 	(&cond).Unmarshal([]byte(condDataStr))
 	if cond.Input == nil {
 		return fmt.Errorf("empty input from condData")
@@ -596,7 +591,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
 		podTemplate = &job.Spec.EvalSpec.Template
 		workerParam.WorkerType = "Eval"
 
-		// Configure Env information for eval by initial runtime.WorkerParam
+		// Configure Env information for eval via WorkerParam
 		workerParam.Env = map[string]string{
 			"NAMESPACE": job.Namespace,
 			"JOB_NAME":  job.Name,
@@ -721,8 +716,7 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	jc := &Controller{
 		kubeClient: cc.KubeClient,
 		client:     cc.SednaClient.SednaV1alpha1(),
-		queue:      workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "lifelonglearningjob"),
-		recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}),
+		queue:      workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
 		cfg:        cfg,
 	}
@@ -751,7 +745,5 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
 	jc.podStore = podInformer.Lister()
 	jc.podStoreSynced = podInformer.Informer().HasSynced
 
-	jc.addUpstreamHandler(cc)
-
 	return jc, nil
 }
diff --git a/pkg/globalmanager/controllers/lifelonglearning/upstream.go b/pkg/globalmanager/controllers/lifelonglearning/upstream.go
index 1c5e768f..011c60ec 100644
--- a/pkg/globalmanager/controllers/lifelonglearning/upstream.go
+++ b/pkg/globalmanager/controllers/lifelonglearning/upstream.go
@@ -32,7 +32,7 @@ import (
 type Model = runtime.Model
 
 // the data of this condition including the input/output to do the next step
-type LifelongLearningCondData struct {
+type ConditionData struct {
 	Input *struct {
 		// Only one model cases
 		Model  *Model  `json:"model,omitempty"`
@@ -57,7 +57,7 @@
 	} `json:"output,omitempty"`
 }
 
-func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string {
+func (cd *ConditionData) joinModelURLs(model *Model, models []Model) []string {
 	var modelURLs []string
 	if model != nil {
 		modelURLs = append(modelURLs, model.GetURL())
@@ -69,19 +69,19 @@
 	return modelURLs
 }
 
-func (cd *LifelongLearningCondData) Unmarshal(data []byte) error {
+func (cd *ConditionData) Unmarshal(data []byte) error {
 	return json.Unmarshal(data, cd)
 }
 
-func (cd LifelongLearningCondData) Marshal() ([]byte, error) {
+func (cd ConditionData) Marshal() ([]byte, error) {
 	return json.Marshal(cd)
 }
 
-func (cd *LifelongLearningCondData) GetInputModelURLs() []string {
+func (cd *ConditionData) GetInputModelURLs() []string {
 	return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
 }
 
-func (cd *LifelongLearningCondData) GetOutputModelURLs() []string {
+func (cd *ConditionData) GetOutputModelURLs() []string {
 	return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
 }
 
@@ -112,7 +112,7 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [
 	// Get the condition data.
 	// Here unmarshal and marshal immediately to skip the unnecessary fields
-	var condData LifelongLearningCondData
+	var condData ConditionData
 	err = json.Unmarshal(content, &condData)
 	if err != nil {
 		return err
 	}
@@ -159,6 +159,6 @@ func (c *Controller) updateFromEdge(name, namespace, operation string, content [
 	return nil
 }
 
-func (c *Controller) addUpstreamHandler(cc *runtime.ControllerContext) error {
-	return cc.UpstreamController.Add(KindName, c.updateFromEdge)
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
 }
diff --git a/pkg/globalmanager/controllers/manager.go b/pkg/globalmanager/controllers/manager.go
index 85328023..42feb40e 100644
--- a/pkg/globalmanager/controllers/manager.go
+++ b/pkg/globalmanager/controllers/manager.go
@@ -76,7 +76,7 @@ func (m *Manager) Start() error {
 		namespace = metav1.NamespaceAll
 	}
 
-	// make this period configurable
+	// TODO(llhuii): make this period configurable
 	minResyncPeriod := time.Second * 30
 
 	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, genResyncPeriod(minResyncPeriod), kubeinformers.WithNamespace(namespace))
@@ -94,15 +94,11 @@ func (m *Manager) Start() error {
 	}
 
 	uc, _ := NewUpstreamController(context)
-	context.UpstreamController = uc
 
 	downstreamSendFunc := messagelayer.NewContextMessageLayer().SendResourceObject
 
 	stopCh := make(chan struct{})
 
-	kubeInformerFactory.Start(stopCh)
-	sednaInformerFactory.Start(stopCh)
-
 	go uc.Run(stopCh)
 
 	for name, factory := range NewRegistry() {
@@ -111,11 +107,15 @@ func (m *Manager) Start() error {
 			return fmt.Errorf("failed to initialize controller %s: %v", name, err)
 		}
 		f.SetDownstreamSendFunc(downstreamSendFunc)
+		f.SetUpstreamHandler(uc.Add)
+
+		klog.Infof("initialized controller %s", name)
 		go f.Run(stopCh)
-		klog.Infof("started controller %s", name)
 	}
 
+	kubeInformerFactory.Start(stopCh)
+	sednaInformerFactory.Start(stopCh)
+
 	addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port)
 	ws := websocket.NewServer(addr)
diff --git a/pkg/globalmanager/controllers/upstream.go b/pkg/globalmanager/controllers/upstream.go
index 9e6a2216..c02f2c57 100644
--- a/pkg/globalmanager/controllers/upstream.go
+++ b/pkg/globalmanager/controllers/upstream.go
@@ -29,13 +29,13 @@ import (
 // UpstreamController subscribes the updates from edge and syncs to k8s api server
 type UpstreamController struct {
 	messageLayer   messagelayer.MessageLayer
-	updateHandlers map[string]runtime.UpstreamUpdateHandler
+	updateHandlers map[string]runtime.UpstreamHandler
 }
 
 func (uc *UpstreamController) checkOperation(operation string) error {
 	// current only support the 'status' operation
 	if operation != "status" {
-		return fmt.Errorf("unknown operation %s", operation)
+		return fmt.Errorf("unknown operation '%s'", operation)
 	}
 	return nil
 }
@@ -84,7 +84,7 @@ func (uc *UpstreamController) Run(stopCh <-chan struct{}) {
 	<-stopCh
 }
 
-func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamUpdateHandler) error {
+func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamHandler) error {
 	kind = strings.ToLower(kind)
 	if _, ok := uc.updateHandlers[kind]; ok {
 		return fmt.Errorf("a upstream handler for kind %s already exists", kind)
@@ -95,10 +95,10 @@ func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamUpdateHan
 }
 
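Add keys handlers by the lower-cased kind and rejects duplicates, so each CRD kind owns exactly one upstream handler. A hedged usage sketch (handleJob is a hypothetical runtime.UpstreamHandler):

    uc, _ := NewUpstreamController(context)
    _ = uc.Add("IncrementalLearningJob", handleJob) // stored under "incrementallearningjob"
    err := uc.Add("incrementallearningjob", handleJob)
    // err != nil: a handler for this kind already exists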
 // NewUpstreamController creates a new Upstream controller from config
-func NewUpstreamController(cc *runtime.ControllerContext) (runtime.UpstreamControllerI, error) {
+func NewUpstreamController(cc *runtime.ControllerContext) (*UpstreamController, error) {
 	uc := &UpstreamController{
 		messageLayer:   messagelayer.NewContextMessageLayer(),
-		updateHandlers: make(map[string]runtime.UpstreamUpdateHandler),
+		updateHandlers: make(map[string]runtime.UpstreamHandler),
 	}
 
 	return uc, nil
diff --git a/pkg/globalmanager/runtime/common.go b/pkg/globalmanager/runtime/common.go
index 47bc7e0e..e85c15c0 100644
--- a/pkg/globalmanager/runtime/common.go
+++ b/pkg/globalmanager/runtime/common.go
@@ -34,12 +34,8 @@ import (
 )
 
 const (
-	// DefaultBackOff is the default backoff period
-	DefaultBackOff = 10 * time.Second
-	// MaxBackOff is the max backoff period
-	MaxBackOff = 360 * time.Second
-	// ResourceUpdateRetries defines times of retrying to update resource
-	ResourceUpdateRetries = 3
+	// resourceUpdateTries defines the number of tries to update a resource
+	resourceUpdateTries = 3
 )
 
 // GetNodeIPByName get node ip by node name
@@ -152,17 +148,15 @@ func ConvertMapToMetrics(metric map[string]interface{}) []sednav1.Metric {
 	return l
 }
 
-const upstreamStatusUpdateRetries = 3
-
 // RetryUpdateStatus simply retries to call the status update func
 func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
 	var err error
-	for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ {
+	for try := 1; try <= resourceUpdateTries; try++ {
 		err = updateStatusFunc()
 		if err == nil {
 			return nil
 		}
-		klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err)
+		klog.Warningf("Failed to update %s/%s status, tried %d times: %+v", namespace, name, try, err)
 	}
 	return err
 }
diff --git a/pkg/globalmanager/runtime/secret_injector.go b/pkg/globalmanager/runtime/secret_injector.go
index 4386a034..8c986f41 100644
--- a/pkg/globalmanager/runtime/secret_injector.go
+++ b/pkg/globalmanager/runtime/secret_injector.go
@@ -121,7 +121,6 @@ func InjectSecretAnnotations(client kubernetes.Interface, obj CommonInterface, s
 }
 
 func injectSecretObj(obj CommonInterface, secret *v1.Secret) (err error) {
-
 	secretData := secret.GetAnnotations()
 
 	for k, v := range secret.Data {
diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go
index ebbcf61f..4a2c075d 100644
--- a/pkg/globalmanager/runtime/types.go
+++ b/pkg/globalmanager/runtime/types.go
@@ -17,6 +17,8 @@ limitations under the License.
 package runtime
 
 import (
+	"time"
+
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	k8sruntime "k8s.io/apimachinery/pkg/runtime"
@@ -29,31 +31,12 @@ import (
 	sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
 )
 
-// CommonInterface describes the commom interface of CRs
-type CommonInterface interface {
-	metav1.Object
-	schema.ObjectKind
-	k8sruntime.Object
-}
-
-// BaseControllerI defines the interface of an controller
-type BaseControllerI interface {
-	Run(stopCh <-chan struct{})
-}
-
-// FeatureControllerI defines the interface of an AI Feature controller
-type FeatureControllerI interface {
-	BaseControllerI
-	SetDownstreamSendFunc(f DownstreamSendFunc) error
-}
-
-type Model struct {
-	Format  string                 `json:"format,omitempty"`
-	URL     string                 `json:"url,omitempty"`
-	Metrics map[string]interface{} `json:"metrics,omitempty"`
-}
-
 const (
+	// DefaultBackOff is the default backoff period
+	DefaultBackOff = 10 * time.Second
+	// MaxBackOff is the max backoff period
+	MaxBackOff = 360 * time.Second
+
 	// TrainPodType is type of train pod
 	TrainPodType = "train"
 	// EvalPodType is type of eval pod
@@ -65,24 +48,52 @@ const (
 	AnnotationsKeyPrefix = "sedna.io/"
 )
 
+type Model struct {
+	Format  string                 `json:"format,omitempty"`
+	URL     string                 `json:"url,omitempty"`
+	Metrics map[string]interface{} `json:"metrics,omitempty"`
+}
+
 func (m *Model) GetURL() string {
 	return m.URL
 }
 
-// updateHandler handles the updates from LC(running at edge) to update the
-// corresponding resource
-type UpstreamUpdateHandler func(namespace, name, operation string, content []byte) error
-
-type UpstreamControllerI interface {
-	BaseControllerI
-	Add(kind string, updateHandler UpstreamUpdateHandler) error
+// CommonInterface describes the common interface of CRs
+type CommonInterface interface {
+	metav1.Object
+	schema.ObjectKind
+	k8sruntime.Object
 }
 
+// UpstreamHandler is the function definition for handling the upstream updates,
+// i.e. resource updates (mainly status) from LC (running at edge)
+type UpstreamHandler = func(namespace, name, operation string, content []byte) error
+
+// UpstreamHandlerAddFunc defines the upstream controller's register function for adding a handler
+type UpstreamHandlerAddFunc = func(kind string, updateHandler UpstreamHandler) error
+
+// DownstreamSendFunc is the send function for feature controllers to sync the resource updates (spec and status) to LC
 type DownstreamSendFunc = func(nodeName string, eventType watch.EventType, obj interface{}) error
 
+// BaseControllerI defines the interface of a controller
+type BaseControllerI interface {
+	Run(stopCh <-chan struct{})
+}
+
+// FeatureControllerI defines the interface of an AI Feature controller
+type FeatureControllerI interface {
+	BaseControllerI
+
+	// SetDownstreamSendFunc sets up the downstream send function in the feature controller
+	SetDownstreamSendFunc(f DownstreamSendFunc) error
+
+	// SetUpstreamHandler sets up the upstream handler function for the feature controller
+	SetUpstreamHandler(add UpstreamHandlerAddFunc) error
+}
+
+// ControllerContext defines the context that all feature controllers share and belong to
 type ControllerContext struct {
-	Config             *config.ControllerConfig
-	UpstreamController UpstreamControllerI
+	Config *config.ControllerConfig
 
 	KubeClient          kubernetes.Interface
 	KubeInformerFactory kubeinformers.SharedInformerFactory
diff --git a/pkg/globalmanager/runtime/worker.go b/pkg/globalmanager/runtime/worker.go
index fab3dd13..df7208f4 100644
--- a/pkg/globalmanager/runtime/worker.go
+++ b/pkg/globalmanager/runtime/worker.go
@@ -105,7 +105,7 @@ func CreateKubernetesService(kubeClient kubernetes.Interface, object CommonInter
 	return service.Spec.Ports[0].NodePort, nil
 }
 
-// injectWorkerParam.Modifies pod in-place
+// injectWorkerParam modifies pod in-place
 func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) {
 	InjectStorageInitializer(pod, workerParam)
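Taken together, types.go now states the full contract a feature must satisfy. Below is a sketch of the smallest conforming controller — not part of this patch; the factory shape matches the feature New(cc *runtime.ControllerContext) constructors above, while the name-to-factory mapping lives in the new registry.go, which is not shown in this excerpt:

    package noop

    import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

    type controller struct {
        sendToEdge runtime.DownstreamSendFunc
    }

    // Run blocks until the stop channel closes; a real controller
    // would start its informers and workers here.
    func (c *controller) Run(stopCh <-chan struct{}) { <-stopCh }

    func (c *controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
        c.sendToEdge = f
        return nil
    }

    func (c *controller) SetUpstreamHandler(add runtime.UpstreamHandlerAddFunc) error {
        // a real feature registers its kind: return add(KindName, c.updateFromEdge)
        return nil
    }

    // New matches the factory shape the manager expects.
    func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
        return &controller{}, nil
    }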