/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package federatedlearning

import (
	"context"
	"fmt"
	"strconv"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilrand "k8s.io/apimachinery/pkg/util/rand"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"

	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
	"github.com/kubeedge/sedna/pkg/globalmanager/config"
	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
)

const (
	// KindName is the kind name of the CR this controller controls
	KindName = "FederatedLearningJob"
	// Name is this controller's name
	Name = "FederatedLearning"
)

const (
	jobStageAgg   = "Aggregation"
	jobStageTrain = "Training"
)

// Kind contains the schema.GroupVersionKind for this controller type.
var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)

// Controller ensures that all FederatedLearningJob objects have corresponding pods to
// run their configured workload.
type Controller struct {
	kubeClient kubernetes.Interface
	client     sednaclientset.SednaV1alpha1Interface

	// podStoreSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podStoreSynced cache.InformerSynced
	// jobStoreSynced returns true if the FederatedLearningJob store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	jobStoreSynced cache.InformerSynced

	// A store of jobs
	jobLister sednav1listers.FederatedLearningJobLister

	// A store of pods, populated by the podController
	podStore corelisters.PodLister

	// FLJobs that need to be updated
	queue workqueue.RateLimitingInterface

	recorder record.EventRecorder

	cfg *config.ControllerConfig

	sendToEdgeFunc runtime.DownstreamSendFunc

	// map to record the pods that have just been recreated
	recreatedPods sync.Map

	flSelector labels.Selector

	aggServiceHost string

	preventRecreation bool
}

// Run starts the main goroutine responsible for watching and syncing jobs.
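//
// A minimal wiring sketch (an assumption for illustration: it presumes the
// caller already holds a *runtime.ControllerContext cc and a stop channel,
// and that the FeatureControllerI returned by New exposes Run as below):
//
//	fc, err := New(cc)
//	if err != nil {
//		klog.Fatal(err)
//	}
//	go fc.Run(stopCh)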
func (c *Controller) Run(stopCh <-chan struct{}) {
	workers := 1

	defer utilruntime.HandleCrash()
	defer c.queue.ShutDown()

	klog.Infof("Starting %s controller", Name)
	defer klog.Infof("Shutting down %s controller", Name)

	if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
		klog.Errorf("failed to wait for %s caches to sync", Name)
		return
	}

	klog.Infof("Starting %s workers", Name)
	for i := 0; i < workers; i++ {
		go wait.Until(c.worker, time.Second, stopCh)
	}

	<-stopCh
}

// enqueueByPod enqueues the FederatedLearningJob object of the specified pod.
func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
	controllerRef := metav1.GetControllerOf(pod)

	if controllerRef == nil {
		return
	}

	if controllerRef.Kind != Kind.Kind {
		return
	}

	job, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
	if err != nil {
		return
	}

	if job.UID != controllerRef.UID {
		return
	}

	c.enqueueController(job, immediate)
}

// When a pod is created, enqueue the controller that manages it and update its expectations.
func (c *Controller) addPod(obj interface{}) {
	pod := obj.(*v1.Pod)
	if pod.DeletionTimestamp != nil {
		// on a restart of the controller, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		c.deletePod(pod)
		return
	}

	// back off before re-queuing when the pod has failed
	immediate := pod.Status.Phase != v1.PodFailed
	c.enqueueByPod(pod, immediate)
}

// When a pod is updated, figure out which federated learning job manages it and wake it up.
func (c *Controller) updatePod(old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)

	// no pod update, nothing to queue
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		return
	}

	c.addPod(curPod)
}

// deletePod enqueues the FederatedLearningJob object when a pod is deleted
func (c *Controller) deletePod(obj interface{}) {
	pod, ok := obj.(*v1.Pod)

	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new FederatedLearningJob will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			klog.Warningf("couldn't get object from tombstone %+v", obj)
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			klog.Warningf("tombstone contained object that is not a pod %+v", obj)
			return
		}
	}

	c.enqueueByPod(pod, true)

	// when the CRD is updated, the pods are deleted on purpose:
	// if c.preventRecreation is true, do not recreate the pod
	if c.preventRecreation {
		return
	}

	// if the pod was manually deleted, recreate it.
	// first check whether the pod is owned by a FederatedLearningJob
	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil || controllerRef.Kind != Kind.Kind {
		return
	}
	_, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
	if err != nil {
		if errors.IsNotFound(err) {
			// The FederatedLearningJob has been deleted, and the Pod should not be rebuilt.
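			// The owning job is gone (e.g., the user deleted the CR itself),
			// so its pods are being garbage-collected; recreating one here
			// would resurrect an orphan.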
klog.Infof("FederatedLearningJob %s/%s not found, skipping pod recreation", pod.Namespace, controllerRef.Name) return } klog.Errorf("Error getting FederatedLearningJob %s/%s: %v", pod.Namespace, controllerRef.Name, err) return } // then check if the pod is already in the map if _, exists := c.recreatedPods.Load(pod.Name); exists { return } // if not, recreate it klog.Infof("Pod %s/%s deleted, recreating...", pod.Namespace, pod.Name) // Create a deep copy of the old pod newPod := pod.DeepCopy() // Reset the resource version and UID as they are unique to each object newPod.ResourceVersion = "" newPod.UID = "" // Clear the status newPod.Status = v1.PodStatus{} // Remove the deletion timestamp newPod.DeletionTimestamp = nil // Remove the deletion grace period seconds newPod.DeletionGracePeriodSeconds = nil _, err = c.kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), newPod, metav1.CreateOptions{}) if err != nil { return } klog.Infof("Successfully recreated pod %s/%s", newPod.Namespace, newPod.Name) // mark the pod as recreated c.recreatedPods.Store(newPod.Name, true) // set a timer to delete the record from the map after a while go func() { time.Sleep(5 * time.Second) c.recreatedPods.Delete(pod.Name) }() } // obj could be an *sednav1.FederatedLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) if err != nil { klog.Warningf("Couldn't get key for object %+v: %v", obj, err) return } backoff := time.Duration(0) if !immediate { backoff = runtime.GetBackoff(c.queue, key) } c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. func (c *Controller) worker() { for c.processNextWorkItem() { } } func (c *Controller) processNextWorkItem() bool { key, quit := c.queue.Get() if quit { return false } defer c.queue.Done(key) forget, err := c.sync(key.(string)) if err == nil { if forget { c.queue.Forget(key) } return true } klog.Warningf("Error syncing federatedlearning job: %v", err) c.queue.AddRateLimited(key) return true } // sync will sync the FederatedLearningJob with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
func (c *Controller) sync(key string) (bool, error) {
	startTime := time.Now()
	defer func() {
		klog.V(4).Infof("Finished syncing federated learning job %q (%v)", key, time.Since(startTime))
	}()

	ns, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return false, err
	}
	if len(ns) == 0 || len(name) == 0 {
		return false, fmt.Errorf("invalid federated learning job key %q: either namespace or name is missing", key)
	}

	sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name)
	if err != nil {
		if errors.IsNotFound(err) {
			klog.V(4).Infof("%s %v has been deleted", Name, key)
			return true, nil
		}
		return false, err
	}

	job := *sharedJob
	// set kind for the FederatedLearningJob in case the kind is missing
	job.SetGroupVersionKind(Kind)

	// if the job finished previously, we don't want to redo the termination
	if IsJobFinished(&job) {
		return true, nil
	}

	c.flSelector, _ = runtime.GenerateSelector(&job)
	pods, err := c.podStore.Pods(job.Namespace).List(c.flSelector)
	if err != nil {
		return false, err
	}

	activePods := utils.FilterActivePods(pods)
	active := int32(len(activePods))
	var activeAgg int32
	var activeTrain int32
	succeeded, failed := countPods(pods)
	conditions := len(job.Status.Conditions)

	// set StartTime the first time the job is handled
	if job.Status.StartTime == nil {
		now := metav1.Now()
		job.Status.StartTime = &now
	}

	var manageJobErr error
	var manageAggErr error
	var manageTrainErr error

	jobFailed := false
	var failureReason string
	var failureMessage string
	phase := job.Status.Phase

	if failed > 0 {
		jobFailed = true
		failureReason = "workerFailed"
		failureMessage = "the worker of FederatedLearningJob failed"
	}

	if jobFailed {
		job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage))
		job.Status.Phase = sednav1.FLJobFailed
		c.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
	} else {
		// the first time around, create the pods
		if len(pods) == 0 {
			activeAgg, manageAggErr = c.createAggPod(&job)

			createServiceErr := c.createService(&job)
			if createServiceErr != nil {
				return false, createServiceErr
			}

			activeTrain, manageTrainErr = c.createTrainPod(&job)
			active = activeAgg + activeTrain
		}

		complete := false
		if succeeded > 0 && active == 0 {
			complete = true
		}
		if complete {
			job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondComplete, "", ""))
			now := metav1.Now()
			job.Status.CompletionTime = &now
			c.recorder.Event(&job, v1.EventTypeNormal, "Completed", "FederatedLearningJob completed")
			job.Status.Phase = sednav1.FLJobSucceeded
		} else {
			job.Status.Phase = sednav1.FLJobRunning
		}
	}

	// Combine manageAggErr and manageTrainErr into a single error
	if manageAggErr != nil || manageTrainErr != nil {
		manageJobErr = fmt.Errorf("aggregator error: %v, training error: %v", manageAggErr, manageTrainErr)
	}

	forget := false
	// Check whether the number of succeeded pods increased since the last check. If yes, "forget" should be true.
	// This logic is linked to https://github.com/kubernetes/kubernetes/issues/56853, which aims to
	// improve the job backoff policy when parallelism > 1 and a few workers failed but others succeeded.
	// In this case, we should clear the backoff delay.
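	// (Forget resets the per-key failure count in the workqueue's rate
	// limiter, so the next AddRateLimited for this key starts again from
	// the base delay instead of the accumulated exponential backoff.)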
	if job.Status.Succeeded < succeeded {
		forget = true
	}

	// no need to update the job if the status hasn't changed since the last time
	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions || job.Status.Phase != phase {
		job.Status.Active = active
		job.Status.Succeeded = succeeded
		job.Status.Failed = failed

		if err := c.updateJobStatus(&job); err != nil {
			return forget, err
		}

		if jobFailed && !IsJobFinished(&job) {
			// returning an error will re-enqueue the FederatedLearningJob after the backoff period
			return forget, fmt.Errorf("failed pod(s) detected for FederatedLearningJob key %q", key)
		}

		forget = true
	}

	return forget, manageJobErr
}

func NewJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
	return sednav1.FLJobCondition{
		Type:              conditionType,
		Status:            v1.ConditionTrue,
		LastProbeTime:     metav1.Now(),
		LastHeartbeatTime: metav1.Now(),
		Reason:            reason,
		Message:           message,
	}
}

// countPods returns the number of succeeded and failed pods
func countPods(pods []*v1.Pod) (succeeded, failed int32) {
	succeeded = int32(filterPods(pods, v1.PodSucceeded))
	failed = int32(filterPods(pods, v1.PodFailed))
	return
}

func (c *Controller) updateJobStatus(job *sednav1.FederatedLearningJob) error {
	jobClient := c.client.FederatedLearningJobs(job.Namespace)
	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		newJob.Status = job.Status
		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
		return err
	})
}

// filterPods returns the number of pods in the given phase.
func filterPods(pods []*v1.Pod, phase v1.PodPhase) int {
	result := 0
	for i := range pods {
		if phase == pods[i].Status.Phase {
			result++
		}
	}
	return result
}

func IsJobFinished(j *sednav1.FederatedLearningJob) bool {
	for _, c := range j.Status.Conditions {
		if (c.Type == sednav1.FLJobCondComplete || c.Type == sednav1.FLJobCondFailed) && c.Status == v1.ConditionTrue {
			return true
		}
	}
	return false
}

func (c *Controller) getSecret(namespace, name, ownerStr string) (secret *v1.Secret, err error) {
	if name != "" {
		secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
		if err != nil {
			err = fmt.Errorf("failed to get the secret %s for %s: %w", name, ownerStr, err)
		}
	}
	return
}

func (c *Controller) getModelAndItsSecret(ctx context.Context, namespace, name string) (model *sednav1.Model, secret *v1.Secret, err error) {
	if name != "" {
		model, err = c.client.Models(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			err = fmt.Errorf("failed to get the model %s: %w", name, err)
		}
	}

	if model != nil {
		secret, err = c.getSecret(
			namespace,
			model.Spec.CredentialName,
			fmt.Sprintf("model %s", name),
		)
	}
	return
}

func (c *Controller) getDatasetAndItsSecret(ctx context.Context, namespace, name string) (dataset *sednav1.Dataset, secret *v1.Secret, err error) {
	if name != "" {
		dataset, err = c.client.Datasets(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			err = fmt.Errorf("failed to get the dataset %s: %w", name, err)
		}
	}

	if dataset != nil {
		secret, err = c.getSecret(
			namespace,
			dataset.Spec.CredentialName,
			fmt.Sprintf("dataset %s", name),
		)
	}
	return
}

// addWorkerMount adds a CR's (e.g., model's or dataset's) URL to the worker mounts.
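//
// For example, this is how the aggregation model is exposed to a worker
// (the call is taken from createAggPod below):
//
//	c.addWorkerMount(&aggWorkerParam, model.Spec.URL, "MODEL_URL", modelSecret, true)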
func (c *Controller) addWorkerMount(workerParam *runtime.WorkerParam, url string, envName string, secret *v1.Secret, downloadByInitializer bool) {
	if url != "" {
		workerParam.Mounts = append(workerParam.Mounts,
			runtime.WorkerMount{
				URL: &runtime.MountURL{
					URL:                   url,
					Secret:                secret,
					DownloadByInitializer: downloadByInitializer,
				},
				EnvName: envName,
			},
		)
	}
}

// addTransmitterToWorkerParam adds the transmitter to the WorkerParam
func (c *Controller) addTransmitterToWorkerParam(param *runtime.WorkerParam, job *sednav1.FederatedLearningJob) error {
	transmitter := job.Spec.Transmitter
	if transmitter.S3 != nil {
		param.Env["TRANSMITTER"] = "s3"

		url := transmitter.S3.AggregationDataPath
		secret, err := c.getSecret(
			job.Namespace,
			transmitter.S3.CredentialName,
			fmt.Sprintf("for aggregationData: %s", url))
		if err != nil {
			return err
		}

		param.Mounts = append(param.Mounts,
			runtime.WorkerMount{
				URL: &runtime.MountURL{
					URL:    url,
					Secret: secret,
				},
				EnvName: "AGG_DATA_PATH",
			},
		)
	} else {
		param.Env["TRANSMITTER"] = "ws"
	}
	return nil
}

func (c *Controller) createAggPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
	active = 0
	ctx := context.Background()

	pretrainedModelName := job.Spec.PretrainedModel.Name
	pretrainedModel, pretrainedModelSecret, err := c.getModelAndItsSecret(ctx, job.Namespace, pretrainedModelName)
	if err != nil {
		return active, err
	}

	modelName := job.Spec.AggregationWorker.Model.Name
	model, modelSecret, err := c.getModelAndItsSecret(ctx, job.Namespace, modelName)
	if err != nil {
		return active, fmt.Errorf("failed to get aggregation model: %w", err)
	}

	participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers))

	// deliver the pod for the aggregation worker
	aggWorker := job.Spec.AggregationWorker

	// Configure the aggregation worker's mounts and envs
	var aggPort int32 = 7363
	var aggWorkerParam runtime.WorkerParam
	aggWorkerParam.Env = map[string]string{
		"NAMESPACE":          job.Namespace,
		"WORKER_NAME":        "aggworker-" + utilrand.String(5),
		"JOB_NAME":           job.Name,
		"AGG_BIND_PORT":      strconv.Itoa(int(aggPort)),
		"PARTICIPANTS_COUNT": participantsCount,
	}

	if err := c.addTransmitterToWorkerParam(&aggWorkerParam, job); err != nil {
		return active, fmt.Errorf("failed to add transmitter to worker param: %w", err)
	}

	aggWorkerParam.WorkerType = jobStageAgg
	aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure

	c.addWorkerMount(&aggWorkerParam, model.Spec.URL, "MODEL_URL", modelSecret, true)
	if pretrainedModel != nil {
		c.addWorkerMount(&aggWorkerParam, pretrainedModel.Spec.URL, "PRETRAINED_MODEL_URL", pretrainedModelSecret, true)
	}

	aggWorker.Template.Name = fmt.Sprintf("%s-aggworker", job.Name)
	// create the aggregation pod based on the configured parameters
	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, &aggWorkerParam)
	if err != nil {
		return active, fmt.Errorf("failed to create aggregation worker: %w", err)
	}
	klog.Infof("created aggregation worker pod successfully")
	active++
	return
}

func (c *Controller) createTrainPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
	active = 0
	ctx := context.Background()

	pretrainedModelName := job.Spec.PretrainedModel.Name
	pretrainedModel, pretrainedModelSecret, err := c.getModelAndItsSecret(ctx, job.Namespace, pretrainedModelName)
	if err != nil {
		return active, fmt.Errorf("failed to get pretrained model: %w", err)
	}

	modelName := job.Spec.AggregationWorker.Model.Name
	model, modelSecret, err := c.getModelAndItsSecret(ctx, job.Namespace, modelName)
	if err != nil {
		return active, fmt.Errorf("failed to get aggregation model: %w", err)
	}

	var aggPort int32 = 7363
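	// NOTE: 7363 is also hardcoded as the aggregation bind port in
	// createAggPod and as the service port in createService; the three
	// must stay in sync for training workers to reach the aggregator.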
	participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers))

	// deliver a pod for each training worker
	for i, trainingWorker := range job.Spec.TrainingWorkers {
		// Configure the training worker's mounts and envs
		var workerParam runtime.WorkerParam

		c.addWorkerMount(&workerParam, model.Spec.URL, "MODEL_URL", modelSecret, true)
		if pretrainedModel != nil {
			c.addWorkerMount(&workerParam, pretrainedModel.Spec.URL, "PRETRAINED_MODEL_URL", pretrainedModelSecret, true)
		}

		datasetName := trainingWorker.Dataset.Name
		dataset, datasetSecret, err := c.getDatasetAndItsSecret(ctx, job.Namespace, datasetName)
		if err != nil {
			return active, err
		}
		c.addWorkerMount(&workerParam, dataset.Spec.URL, "TRAIN_DATASET_URL", datasetSecret, true)

		workerParam.Env = map[string]string{
			"AGG_PORT":           strconv.Itoa(int(aggPort)),
			"AGG_IP":             c.aggServiceHost,
			"WORKER_NAME":        "trainworker-" + utilrand.String(5),
			"JOB_NAME":           job.Name,
			"PARTICIPANTS_COUNT": participantsCount,
			"NAMESPACE":          job.Namespace,
			"MODEL_NAME":         modelName,
			"DATASET_NAME":       datasetName,
			"LC_SERVER":          c.cfg.LC.Server,
		}

		workerParam.WorkerType = runtime.TrainPodType
		workerParam.HostNetwork = true
		workerParam.RestartPolicy = v1.RestartPolicyOnFailure

		if err := c.addTransmitterToWorkerParam(&workerParam, job); err != nil {
			return active, fmt.Errorf("failed to add transmitter to worker param: %w", err)
		}

		trainingWorker.Template.Name = fmt.Sprintf("%s-trainworker-%d", job.Name, i)
		// create the training worker based on the configured parameters
		_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, &workerParam)
		if err != nil {
			return active, fmt.Errorf("failed to create training worker %d: %w", i, err)
		}
		active++
	}

	return
}

// New creates a new federated learning job controller that keeps the relevant pods
// in sync with their corresponding FederatedLearningJob objects.
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
	cfg := cc.Config

	podInformer := cc.KubeInformerFactory.Core().V1().Pods()

	jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().FederatedLearningJobs()

	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})

	fc := &Controller{
		kubeClient: cc.KubeClient,
		client:     cc.SednaClient.SednaV1alpha1(),

		queue: workqueue.NewRateLimitingQueueWithConfig(
			workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff),
			workqueue.RateLimitingQueueConfig{Name: Name},
		),
		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: Name + "-controller"}),
		cfg:      cfg,
	}

	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			fc.enqueueController(obj, true)

			// when a federated learning job is added,
			// send it to the edge's LC.
			fc.syncToEdge(watch.Added, obj)
		},

		UpdateFunc: fc.updateJob,

		DeleteFunc: func(obj interface{}) {
			fc.enqueueController(obj, true)

			// when a federated learning job is deleted,
			// send it to the edge's LC.
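			// (the Deleted event lets the LC clean up its local state for the job)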
			fc.syncToEdge(watch.Deleted, obj)
		},
	})

	fc.jobLister = jobInformer.Lister()
	fc.jobStoreSynced = jobInformer.Informer().HasSynced

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    fc.addPod,
		UpdateFunc: fc.updatePod,
		DeleteFunc: fc.deletePod,
	})
	fc.podStore = podInformer.Lister()
	fc.podStoreSynced = podInformer.Informer().HasSynced

	return fc, nil
}

func (c *Controller) updateJob(old, cur interface{}) {
	oldJob, ok := old.(*sednav1.FederatedLearningJob)
	if !ok {
		return
	}
	curJob, ok := cur.(*sednav1.FederatedLearningJob)
	if !ok {
		return
	}

	if oldJob.ResourceVersion == curJob.ResourceVersion {
		return
	}

	if oldJob.Generation != curJob.Generation {
		pods, err := c.podStore.Pods(curJob.Namespace).List(c.flSelector)
		if err != nil {
			klog.Errorf("Failed to list pods: %v", err)
		}

		// the spec changed, so delete the old pods and recreate them below;
		// preventRecreation stops deletePod from rebuilding them in parallel
		c.preventRecreation = true

		// delete all pods
		for _, pod := range pods {
			if err := c.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{}); err != nil {
				klog.Errorf("Failed to delete pod %s/%s: %v", pod.Namespace, pod.Name, err)
				continue
			}
			klog.Infof("CRD modified, so we deleted pod %s/%s", pod.Namespace, pod.Name)
		}
		klog.Infof("CRD modified, so we deleted all pods and will create new ones")

		curJob.SetGroupVersionKind(Kind)

		_, err = c.createAggPod(curJob)
		if err != nil {
			klog.Errorf("Failed to create aggregation worker: %v", err)
		}
		_, err = c.createTrainPod(curJob)
		if err != nil {
			klog.Errorf("Failed to create training workers: %v", err)
		}

		// update the job status
		if _, err := c.client.FederatedLearningJobs(curJob.Namespace).Update(context.TODO(), curJob, metav1.UpdateOptions{}); err != nil {
			klog.Errorf("Failed to update FederatedLearningJob %s/%s: %v", curJob.Namespace, curJob.Name, err)
		}
	}
	c.preventRecreation = false

	c.enqueueController(curJob, true)

	// when a federated learning job is updated,
	// send it to the edge's LC as an Added event.
	c.syncToEdge(watch.Added, curJob)
}

// createService creates the edgemesh service for the job's aggregation worker
func (c *Controller) createService(job *sednav1.FederatedLearningJob) (err error) {
	var aggPort int32 = 7363
	c.aggServiceHost, err = runtime.CreateEdgeMeshService(c.kubeClient, job, jobStageAgg, aggPort)
	if err != nil {
		return err
	}
	return nil
}
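// Note on ordering: sync calls createService before createTrainPod, because
// the host returned by CreateEdgeMeshService is cached in c.aggServiceHost
// and handed to every training worker as its AGG_IP environment variable.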