gm: decouple all features into independent packages (tag: v0.3.1)
@@ -18,6 +18,7 @@ package app
 import (
 	"fmt"
+	"os"

 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
@@ -27,7 +28,7 @@ import (
 	"k8s.io/klog/v2"

 	"github.com/kubeedge/sedna/cmd/sedna-gm/app/options"
-	controller "github.com/kubeedge/sedna/pkg/globalmanager"
+	controller "github.com/kubeedge/sedna/pkg/globalmanager/controllers"
 	"github.com/kubeedge/sedna/pkg/util"
 	"github.com/kubeedge/sedna/pkg/version/verflag"
 )
@@ -61,8 +62,12 @@ func NewControllerCommand() *cobra.Command {
 			if errs := config.Validate(); len(errs) > 0 {
 				klog.Fatal(util.SpliceErrors(errs.ToAggregate().Errors()))
 			}
-			c := controller.NewController(config)
-			c.Start()
+			c := controller.New(config)
+			err = c.Start()
+			if err != nil {
+				klog.Errorf("failed to start controller: %v", err)
+				os.Exit(1)
+			}
 		},
 	}
 	fs := cmd.Flags()
@@ -17,7 +17,9 @@ limitations under the License.
 package main

 import (
+	"math/rand"
 	"os"
+	"time"

 	"k8s.io/component-base/logs"
@@ -25,6 +27,8 @@ import (
 )

 func main() {
+	rand.Seed(time.Now().UnixNano())
+
 	command := app.NewControllerCommand()
 	logs.InitLogs()
 	defer logs.FlushLogs()
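
The new controllers package that NewControllerCommand now calls into is not part of this excerpt. Judging from the call shape above (controller.New(config) returning something with Start() error) and the deleted MainController below, its entry point plausibly iterates per-feature constructors; a hypothetical, self-contained sketch in which every type is assumed rather than taken from the repository:

// Hypothetical sketch only — the real pkg/globalmanager/controllers is not in
// this diff. It illustrates the New(config)/Start() shape called above.
package controllers

import "fmt"

// Config stands in for config.ControllerConfig (assumed).
type Config struct{}

// Feature mirrors the Run method the feature controllers in this commit expose.
type Feature interface {
	Run(stopCh <-chan struct{})
}

// Manager drives the registered feature controllers.
type Manager struct {
	Config   *Config
	features map[string]func(*Config) (Feature, error) // assumed registry
}

// New mirrors the call site in NewControllerCommand.
func New(cc *Config) *Manager {
	return &Manager{Config: cc}
}

// Start constructs and runs each feature controller, much as the deleted
// MainController.Start below looped over its feature constructors.
func (m *Manager) Start() error {
	stopCh := make(chan struct{})
	for name, create := range m.features {
		f, err := create(m.Config)
		if err != nil {
			return fmt.Errorf("failed to create %s controller: %w", name, err)
		}
		go f.Run(stopCh)
	}
	return nil
}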
@@ -1,71 +0,0 @@
-/*
-Copyright 2021 The KubeEdge Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package globalmanager
-
-import (
-	"fmt"
-	"os"
-
-	"k8s.io/klog/v2"
-
-	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
-)
-
-// MainController defines the main controller
-type MainController struct {
-	Config *config.ControllerConfig
-}
-
-// NewController creates a new main controller
-func NewController(cc *config.ControllerConfig) *MainController {
-	config.InitConfigure(cc)
-	return &MainController{
-		Config: cc,
-	}
-}
-
-// Start starts the main controller
-func (c *MainController) Start() {
-	type newFunc func(cfg *config.ControllerConfig) (FeatureControllerI, error)
-
-	for _, featureFunc := range []newFunc{
-		NewUpstreamController,
-		NewDownstreamController,
-		NewFederatedController,
-		NewJointController,
-		NewIncrementalJobController,
-		NewLifelongLearningJobController,
-	} {
-		f, _ := featureFunc(c.Config)
-		err := f.Start()
-		if err != nil {
-			klog.Warningf("failed to start controller %s: %+v", f.GetName(), err)
-		} else {
-			klog.Infof("started controller %s", f.GetName())
-		}
-	}
-
-	addr := fmt.Sprintf("%s:%d", c.Config.WebSocket.Address, c.Config.WebSocket.Port)
-
-	ws := websocket.NewServer(addr)
-	err := ws.ListenAndServe()
-	if err != nil {
-		klog.Fatalf("failed to listen websocket at %s", addr)
-		os.Exit(1)
-	}
-}
@@ -0,0 +1,74 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package dataset
+
+import (
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/cache"
+
+	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/config"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+const (
+	// KindName is the kind name of CR this controller controls
+	KindName = "Dataset"
+	// Name is this controller name
+	Name = "Dataset"
+)
+
+// Controller handles all dataset objects including: syncing to edge and update from edge.
+type Controller struct {
+	kubeClient kubernetes.Interface
+	client     sednaclientset.SednaV1alpha1Interface
+
+	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
+}
+
+func (c *Controller) Run(stopCh <-chan struct{}) {
+	// noop now
+}
+
+// New creates a dataset controller
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	c := &Controller{
+		client:     cc.SednaClient.SednaV1alpha1(),
+		kubeClient: cc.KubeClient,
+	}
+
+	informer := cc.SednaInformerFactory.Sedna().V1alpha1().Datasets().Informer()
+	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			c.syncToEdge(watch.Added, obj)
+		},
+		UpdateFunc: func(old, cur interface{}) {
+			c.syncToEdge(watch.Added, cur)
+		},
+		DeleteFunc: func(obj interface{}) {
+			c.syncToEdge(watch.Deleted, obj)
+		},
+	})
+
+	return c, nil
+}
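
runtime.FeatureControllerI itself is not shown in this commit. From the methods the feature packages implement (Run, SetDownstreamSendFunc, SetUpstreamHandler) and their call sites, a plausible reconstruction of the interface and the two injected function types would be:

// Plausible reconstruction; the actual definitions live in
// pkg/globalmanager/runtime and may differ in detail.
package runtime

import "k8s.io/apimachinery/pkg/watch"

// DownstreamSendFunc sends an object to the LC on the named edge node;
// signature inferred from c.sendToEdgeFunc(nodeName, eventType, dataset).
type DownstreamSendFunc func(nodeName string, eventType watch.EventType, obj interface{}) error

// UpstreamHandlerAddFunc registers a handler for status updates of one kind;
// signature inferred from addFunc(KindName, c.updateFromEdge).
type UpstreamHandlerAddFunc func(kind string, handler func(name, namespace, operation string, content []byte) error) error

// FeatureControllerI is what every feature package's New returns.
type FeatureControllerI interface {
	Run(stopCh <-chan struct{})
	SetDownstreamSendFunc(f DownstreamSendFunc) error
	SetUpstreamHandler(addFunc UpstreamHandlerAddFunc) error
}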
@@ -0,0 +1,54 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package dataset
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+// syncToEdge syncs the dataset resources
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	dataset, ok := obj.(*sednav1.Dataset)
+	if !ok {
+		return nil
+	}
+
+	// Since t.Kind may be empty,
+	// we need to fix the kind here if missing.
+	// more details at https://github.com/kubernetes/kubernetes/issues/3030
+	dataset.Kind = KindName
+
+	// Here only propagate to the nodes with non empty name
+	nodeName := dataset.Spec.NodeName
+	if len(nodeName) == 0 {
+		return fmt.Errorf("empty node name")
+	}
+
+	runtime.InjectSecretAnnotations(c.kubeClient, dataset, dataset.Spec.CredentialName)
+
+	return c.sendToEdgeFunc(nodeName, eventType, dataset)
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+
+	return nil
+}
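
Because the send function is injected via SetDownstreamSendFunc rather than hard-wired to the websocket layer (as in the deleted MainController), syncToEdge can be exercised with a stub. A hypothetical test-style sketch, assuming the DownstreamSendFunc signature reconstructed above:

// Hypothetical sketch (would live alongside package dataset).
func TestSyncToEdgeWithStub(t *testing.T) {
	sent := map[string][]interface{}{}
	c := &Controller{}
	// Record what syncToEdge pushes, instead of sending to an edge node.
	_ = c.SetDownstreamSendFunc(func(nodeName string, eventType watch.EventType, obj interface{}) error {
		sent[nodeName] = append(sent[nodeName], obj)
		return nil
	})
	// A Dataset whose Spec.NodeName is "edge-0" fed to c.syncToEdge would now
	// be recorded under sent["edge-0"]; a real test would also need a fake
	// kubeClient for runtime.InjectSecretAnnotations.
}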
@@ -0,0 +1,62 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package dataset
+
+import (
+	"context"
+	"encoding/json"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+// updateFromEdge syncs update from edge
+func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
+	status := sednav1.DatasetStatus{}
+	err := json.Unmarshal(content, &status)
+	if err != nil {
+		return err
+	}
+
+	return c.updateStatus(name, namespace, status)
+}
+
+// updateStatus updates the dataset status
+func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error {
+	client := c.client.Datasets(namespace)
+
+	if status.UpdateTime == nil {
+		now := metav1.Now()
+		status.UpdateTime = &now
+	}
+
+	return runtime.RetryUpdateStatus(name, namespace, func() error {
+		dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		dataset.Status = status
+		_, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{})
+		return err
+	})
+}
+
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
+}
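
runtime.RetryUpdateStatus is not defined in this excerpt; it replaces hand-rolled loops like the one removed from updateFLJobStatus further down. Modeled on that deleted loop, it is presumably close to the following hypothetical sketch (the real version may instead use client-go's retry.RetryOnConflict, or back off between attempts):

// Hypothetical reconstruction, not the actual runtime package file.
package runtime

// resourceUpdateRetries mirrors the ResourceUpdateRetries constant used by
// the retry loop this commit deletes; the value here is an assumption.
const resourceUpdateRetries = 3

// RetryUpdateStatus retries a get-modify-update closure a bounded number of
// times, returning the last error if it never succeeds.
func RetryUpdateStatus(name, namespace string, updateFn func() error) error {
	var err error
	for i := 0; i <= resourceUpdateRetries; i++ {
		if err = updateFn(); err == nil {
			return nil
		}
	}
	return err
}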
@@ -0,0 +1,56 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package federatedlearning
+
+import (
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	job, ok := obj.(*sednav1.FederatedLearningJob)
+	if !ok {
+		return nil
+	}
+
+	// Since Kind may be empty,
+	// we need to fix the kind here if missing.
+	// more details at https://github.com/kubernetes/kubernetes/issues/3030
+	job.Kind = KindName
+
+	// broadcast to all nodes specified in spec
+	nodeset := make(map[string]bool)
+	for _, trainingWorker := range job.Spec.TrainingWorkers {
+		// Here only propagate to the nodes with non empty name
+		if len(trainingWorker.Template.Spec.NodeName) > 0 {
+			nodeset[trainingWorker.Template.Spec.NodeName] = true
+		}
+	}
+
+	for nodeName := range nodeset {
+		c.sendToEdgeFunc(nodeName, eventType, job)
+	}
+
+	return nil
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+
+	return nil
+}
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package globalmanager
+package federatedlearning

 import (
 	"context"
@@ -28,7 +28,7 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
-	kubeinformers "k8s.io/client-go/informers"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -40,33 +40,37 @@ import (
 	k8scontroller "k8s.io/kubernetes/pkg/controller"

 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
 	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
-	informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
 	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
-	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 )

 const (
-	FLJobStageAgg   = "Aggregation"
-	FLJobStageTrain = "Training"
+	// KindName is the kind name of CR this controller controls
+	KindName = "FederatedLearningJob"
+	// Name is this controller name
+	Name = "FederatedLearning"
 )

-// flJobControllerKind contains the schema.GroupVersionKind for this controller type.
-var flJobControllerKind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob")
+const (
+	jobStageAgg   = "Aggregation"
+	jobStageTrain = "Training"
+)
+
+// Kind contains the schema.GroupVersionKind for this controller type.
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)

-// FederatedController ensures that all FLJob objects have corresponding pods to
+// Controller ensures that all FederatedLearningJob objects have corresponding pods to
 // run their configured workload.
-type FederatedController struct {
+type Controller struct {
 	kubeClient kubernetes.Interface
 	client     sednaclientset.SednaV1alpha1Interface

 	// podStoreSynced returns true if the pod store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	podStoreSynced cache.InformerSynced
-	// jobStoreSynced returns true if the flJob store has been synced at least once.
+	// jobStoreSynced returns true if the FederatedLearningJob store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	jobStoreSynced cache.InformerSynced
@@ -82,48 +86,47 @@ type FederatedController struct {
 	recorder record.EventRecorder

 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }

-// Run the main goroutine responsible for watching and syncing jobs.
-func (fc *FederatedController) Start() error {
+// Run starts the main goroutine responsible for watching and syncing jobs.
+func (c *Controller) Run(stopCh <-chan struct{}) {
 	workers := 1
-	stopCh := messageContext.Done()

-	go func() {
-		defer utilruntime.HandleCrash()
-		defer fc.queue.ShutDown()
-		klog.Infof("Starting federatedlearning job controller")
-		defer klog.Infof("Shutting down federatedlearning job controller")
+	defer utilruntime.HandleCrash()
+	defer c.queue.ShutDown()

-		if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, fc.podStoreSynced, fc.jobStoreSynced) {
-			klog.Errorf("failed to wait for caches to sync")
-			return
-		}
+	klog.Infof("Starting %s controller", Name)
+	defer klog.Infof("Shutting down %s controller", Name)

-		klog.Infof("Starting federatedlearning job workers")
-		for i := 0; i < workers; i++ {
-			go wait.Until(fc.worker, time.Second, stopCh)
-		}
+	if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
+		klog.Errorf("failed to wait for %s caches to sync", Name)
+		return
+	}

-		<-stopCh
-	}()
-	return nil
+	klog.Infof("Starting %s workers", Name)
+	for i := 0; i < workers; i++ {
+		go wait.Until(c.worker, time.Second, stopCh)
+	}
+
+	<-stopCh
 }
 // enqueueByPod enqueues the FederatedLearningJob object of the specified pod.
-func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) {
+func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)

 	if controllerRef == nil {
 		return
 	}

-	if controllerRef.Kind != flJobControllerKind.Kind {
+	if controllerRef.Kind != Kind.Kind {
 		return
 	}

-	job, err := fc.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
+	job, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
 	if err != nil {
 		return
 	}

@@ -132,27 +135,27 @@ func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) {
 		return
 	}

-	fc.enqueueController(job, immediate)
+	c.enqueueController(job, immediate)
 }

 // When a pod is created, enqueue the controller that manages it and update it's expectations.
-func (fc *FederatedController) addPod(obj interface{}) {
+func (c *Controller) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller, it's possible a new pod shows up in a state that
 		// is already pending deletion. Prevent the pod from being a creation observation.
-		fc.deletePod(pod)
+		c.deletePod(pod)
 		return
 	}

 	// backoff to queue when PodFailed
 	immediate := pod.Status.Phase != v1.PodFailed

-	fc.enqueueByPod(pod, immediate)
+	c.enqueueByPod(pod, immediate)
 }

 // When a pod is updated, figure out what federatedlearning job manage it and wake them up.
-func (fc *FederatedController) updatePod(old, cur interface{}) {
+func (c *Controller) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)

@@ -161,11 +164,11 @@ func (fc *FederatedController) updatePod(old, cur interface{}) {
 		return
 	}

-	fc.addPod(curPod)
+	c.addPod(curPod)
 }

 // deletePod enqueues the FederatedLearningJob obj When a pod is deleted
-func (fc *FederatedController) deletePod(obj interface{}) {
+func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)

 	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -186,13 +189,13 @@ func (fc *FederatedController) deletePod(obj interface{}) {
 			return
 		}
 	}
-	fc.enqueueByPod(pod, true)
+	c.enqueueByPod(pod, true)
 }

 // obj could be an *sednav1.FederatedLearningJob, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.
-func (fc *FederatedController) enqueueController(obj interface{}, immediate bool) {
+func (c *Controller) enqueueController(obj interface{}, immediate bool) {
 	key, err := k8scontroller.KeyFunc(obj)
 	if err != nil {
 		klog.Warningf("Couldn't get key for object %+v: %v", obj, err)
@@ -201,43 +204,43 @@ func (fc *FederatedController) enqueueController(obj interface{}, immediate bool) {
 	backoff := time.Duration(0)
 	if !immediate {
-		backoff = getBackoff(fc.queue, key)
+		backoff = runtime.GetBackoff(c.queue, key)
 	}
-	fc.queue.AddAfter(key, backoff)
+	c.queue.AddAfter(key, backoff)
 }
 // worker runs a worker thread that just dequeues items, processes them, and marks them done.
 // It enforces that the syncHandler is never invoked concurrently with the same key.
-func (fc *FederatedController) worker() {
-	for fc.processNextWorkItem() {
+func (c *Controller) worker() {
+	for c.processNextWorkItem() {
 	}
 }

-func (fc *FederatedController) processNextWorkItem() bool {
-	key, quit := fc.queue.Get()
+func (c *Controller) processNextWorkItem() bool {
+	key, quit := c.queue.Get()
 	if quit {
 		return false
 	}
-	defer fc.queue.Done(key)
+	defer c.queue.Done(key)

-	forget, err := fc.syncFLJob(key.(string))
+	forget, err := c.sync(key.(string))
 	if err == nil {
 		if forget {
-			fc.queue.Forget(key)
+			c.queue.Forget(key)
 		}
 		return true
 	}

 	klog.Warningf("Error syncing federatedlearning job: %v", err)
-	fc.queue.AddRateLimited(key)
+	c.queue.AddRateLimited(key)

 	return true
 }

-// syncFLJob will sync the flJob with the given key if it has had its expectations fulfilled, meaning
+// sync will sync the FederatedLearningJob with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (fc *FederatedController) syncFLJob(key string) (bool, error) {
+func (c *Controller) sync(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing federatedlearning job %q (%v)", key, time.Since(startTime))
@@ -250,91 +253,96 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid federatedlearning job key %q: either namespace or name is missing", key)
 	}
-	sharedFLJob, err := fc.jobLister.FederatedLearningJobs(ns).Get(name)
+	sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
-			klog.V(4).Infof("FLJob has been deleted: %v", key)
+			klog.V(4).Infof("%s %v has been deleted", Name, key)
 			return true, nil
 		}
 		return false, err
 	}
-	flJob := *sharedFLJob
-	// set kind for flJob in case that the kind is None
-	flJob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob"))
-	// if flJob was finished previously, we don't want to redo the termination
-	if IsFLJobFinished(&flJob) {
+	job := *sharedJob
+	// set kind for FederatedLearningJob in case that the kind is None
+	job.SetGroupVersionKind(Kind)
+	// if job was finished previously, we don't want to redo the termination
+	if IsJobFinished(&job) {
 		return true, nil
 	}
-	selector, _ := GenerateSelector(&flJob)
-	pods, err := fc.podStore.Pods(flJob.Namespace).List(selector)
+	selector, _ := runtime.GenerateSelector(&job)
+	pods, err := c.podStore.Pods(job.Namespace).List(selector)
 	if err != nil {
 		return false, err
 	}
 	activePods := k8scontroller.FilterActivePods(pods)
 	active := int32(len(activePods))
-	succeeded, failed := getStatus(pods)
-	conditions := len(flJob.Status.Conditions)
-	// flJob first start
-	if flJob.Status.StartTime == nil {
+	succeeded, failed := countPods(pods)
+	conditions := len(job.Status.Conditions)
+	// set StartTime when job is handled firstly
+	if job.Status.StartTime == nil {
 		now := metav1.Now()
-		flJob.Status.StartTime = &now
+		job.Status.StartTime = &now
 	}

 	var manageJobErr error
 	jobFailed := false
 	var failureReason string
 	var failureMessage string
-	phase := flJob.Status.Phase
+	phase := job.Status.Phase

 	if failed > 0 {
 		jobFailed = true
 		failureReason = "workerFailed"
-		failureMessage = "the worker of FLJob failed"
+		failureMessage = "the worker of FederatedLearningJob failed"
 	}

 	if jobFailed {
-		flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage))
-		flJob.Status.Phase = sednav1.FLJobFailed
-		fc.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage)
+		job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage))
+		job.Status.Phase = sednav1.FLJobFailed
+		c.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
 	} else {
 		// in the First time, we create the pods
 		if len(pods) == 0 {
-			active, manageJobErr = fc.createPod(&flJob)
+			active, manageJobErr = c.createPod(&job)
 		}
 		complete := false
 		if succeeded > 0 && active == 0 {
 			complete = true
 		}
 		if complete {
-			flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondComplete, "", ""))
+			job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondComplete, "", ""))
 			now := metav1.Now()
-			flJob.Status.CompletionTime = &now
-			fc.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed")
-			flJob.Status.Phase = sednav1.FLJobSucceeded
+			job.Status.CompletionTime = &now
+			c.recorder.Event(&job, v1.EventTypeNormal, "Completed", "FederatedLearningJob completed")
+			job.Status.Phase = sednav1.FLJobSucceeded
 		} else {
-			flJob.Status.Phase = sednav1.FLJobRunning
+			job.Status.Phase = sednav1.FLJobRunning
 		}
 	}

 	forget := false
 	// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
 	// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
-	// improve the FLJob backoff policy when parallelism > 1 and few FLJobs failed but others succeed.
+	// improve the job backoff policy when parallelism > 1 and few FLJobs failed but others succeed.
 	// In this case, we should clear the backoff delay.
-	if flJob.Status.Succeeded < succeeded {
+	if job.Status.Succeeded < succeeded {
 		forget = true
 	}

-	// no need to update the flJob if the status hasn't changed since last time
-	if flJob.Status.Active != active || flJob.Status.Succeeded != succeeded || flJob.Status.Failed != failed || len(flJob.Status.Conditions) != conditions || flJob.Status.Phase != phase {
-		flJob.Status.Active = active
-		flJob.Status.Succeeded = succeeded
-		flJob.Status.Failed = failed
+	// no need to update the job if the status hasn't changed since last time
+	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions || job.Status.Phase != phase {
+		job.Status.Active = active
+		job.Status.Succeeded = succeeded
+		job.Status.Failed = failed
+		c.updateJobStatus(&job)

-		if jobFailed && !IsFLJobFinished(&flJob) {
-			// returning an error will re-enqueue FLJob after the backoff period
-			return forget, fmt.Errorf("failed pod(s) detected for flJob key %q", key)
+		if jobFailed && !IsJobFinished(&job) {
+			// returning an error will re-enqueue FederatedLearningJob after the backoff period
+			return forget, fmt.Errorf("failed pod(s) detected for FederatedLearningJob key %q", key)
 		}

 		forget = true
@@ -343,7 +351,7 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) {
 	return forget, manageJobErr
 }

-func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
+func NewJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
 	return sednav1.FLJobCondition{
 		Type:   conditionType,
 		Status: v1.ConditionTrue,
@@ -354,28 +362,24 @@ func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
 	}
 }

-// getStatus returns no of succeeded and failed pods running a flJob
-func getStatus(pods []*v1.Pod) (succeeded, failed int32) {
+// countPods returns number of succeeded and failed pods
+func countPods(pods []*v1.Pod) (succeeded, failed int32) {
 	succeeded = int32(filterPods(pods, v1.PodSucceeded))
 	failed = int32(filterPods(pods, v1.PodFailed))
 	return
 }

-func (fc *FederatedController) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error {
-	jobClient := fc.client.FederatedLearningJobs(flJob.Namespace)
-	var err error
-	for i := 0; i <= ResourceUpdateRetries; i = i + 1 {
-		var newFLJob *sednav1.FederatedLearningJob
-		newFLJob, err = jobClient.Get(context.TODO(), flJob.Name, metav1.GetOptions{})
+func (c *Controller) updateJobStatus(job *sednav1.FederatedLearningJob) error {
+	jobClient := c.client.FederatedLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
-		}
-		newFLJob.Status = flJob.Status
-		if _, err = jobClient.UpdateStatus(context.TODO(), newFLJob, metav1.UpdateOptions{}); err == nil {
-			break
+			return err
 		}
-	}
-	return nil
+		newJob.Status = job.Status
+		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
+		return err
+	})
 }
 // filterPods returns pods based on their phase.
@@ -389,7 +393,7 @@ func filterPods(pods []*v1.Pod, phase v1.PodPhase) int {
 	return result
 }

-func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool {
+func IsJobFinished(j *sednav1.FederatedLearningJob) bool {
 	for _, c := range j.Status.Conditions {
 		if (c.Type == sednav1.FLJobCondComplete || c.Type == sednav1.FLJobCondFailed) && c.Status == v1.ConditionTrue {
 			return true
@@ -398,12 +402,12 @@ func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool {
 	return false
 }

-func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
+func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	active = 0
 	ctx := context.Background()

 	modelName := job.Spec.AggregationWorker.Model.Name
-	model, err := fc.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{})
+	model, err := c.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{})
 	if err != nil {
 		return active, fmt.Errorf("failed to get model %s: %w",
 			modelName, err)
@@ -412,7 +416,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	secretName := model.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}
 	participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers))

@@ -420,10 +424,10 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	// deliver pod for aggregation worker
 	aggWorker := job.Spec.AggregationWorker

-	// Configure container mounting and Env information by initial WorkerParam
+	// Configure aggregation worker's mounts and envs
 	var aggPort int32 = 7363
-	var aggWorkerParam *WorkerParam = new(WorkerParam)
-	aggWorkerParam.env = map[string]string{
+	var aggWorkerParam runtime.WorkerParam
+	aggWorkerParam.Env = map[string]string{
 		"NAMESPACE":   job.Namespace,
 		"WORKER_NAME": "aggworker-" + utilrand.String(5),
 		"JOB_NAME":    job.Name,
@@ -432,12 +436,12 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 		"PARTICIPANTS_COUNT": participantsCount,
 	}

-	aggWorkerParam.workerType = FLJobStageAgg
-	aggWorkerParam.restartPolicy = v1.RestartPolicyOnFailure
+	aggWorkerParam.WorkerType = jobStageAgg
+	aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure

-	aggWorkerParam.mounts = append(aggWorkerParam.mounts,
-		WorkerMount{
-			URL: &MountURL{
+	aggWorkerParam.Mounts = append(aggWorkerParam.Mounts,
+		runtime.WorkerMount{
+			URL: &runtime.MountURL{
 				URL:                   model.Spec.URL,
 				Secret:                modelSecret,
 				DownloadByInitializer: false,
@@ -447,9 +451,9 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	)

 	// create aggpod based on configured parameters
-	_, err = createPodWithTemplate(fc.kubeClient, job, &aggWorker.Template, aggWorkerParam)
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, &aggWorkerParam)
 	if err != nil {
-		return active, err
+		return active, fmt.Errorf("failed to create aggregation worker: %w", err)
 	}
 	active++

@@ -458,17 +462,21 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	// FIXME(llhuii): only the case that Spec.NodeName specified is support,
 	// will support Spec.NodeSelector.
-	appIP, err = GetNodeIPByName(fc.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName)
+	appIP, err = runtime.GetNodeIPByName(c.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName)
+	if err != nil {
+		return active, err
+	}

-	aggServicePort, err = CreateKubernetesService(fc.kubeClient, job, FLJobStageAgg, aggPort, appIP)
+	aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, jobStageAgg, aggPort, appIP)
 	if err != nil {
 		return active, err
 	}
 	// deliver pod for training worker
-	for _, trainingWorker := range job.Spec.TrainingWorkers {
+	for i, trainingWorker := range job.Spec.TrainingWorkers {
 		// get dataseturl through parsing crd of dataset
 		datasetName := trainingWorker.Dataset.Name
-		dataset, err := fc.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{})
+		dataset, err := c.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{})
 		if err != nil {
 			return active, fmt.Errorf("failed to get dataset %s: %w",
 				datasetName, err)
@@ -477,23 +485,22 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 		secretName := dataset.Spec.CredentialName
 		var datasetSecret *v1.Secret
 		if secretName != "" {
-			datasetSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+			datasetSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 		}

-		// Configure container mounting and Env information by initial WorkerParam
-		var workerParam *WorkerParam = new(WorkerParam)
-		workerParam.mounts = append(workerParam.mounts,
-			WorkerMount{
-				URL: &MountURL{
+		// Configure training worker's mounts and envs
+		var workerParam runtime.WorkerParam
+		workerParam.Mounts = append(workerParam.Mounts,
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:    model.Spec.URL,
 					Secret: modelSecret,
 				},
 				EnvName: "MODEL_URL",
 			},

-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:    dataset.Spec.URL,
 					Secret: datasetSecret,
 				},
@@ -501,7 +508,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 			},
 		)

-		workerParam.env = map[string]string{
+		workerParam.Env = map[string]string{
 			"AGG_PORT": strconv.Itoa(int(aggServicePort)),
 			"AGG_IP":   appIP,
@@ -511,65 +518,67 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 			"NAMESPACE":    job.Namespace,
 			"MODEL_NAME":   modelName,
 			"DATASET_NAME": datasetName,
-			"LC_SERVER":    fc.cfg.LC.Server,
+			"LC_SERVER":    c.cfg.LC.Server,
 		}

-		workerParam.workerType = TrainPodType
-		workerParam.hostNetwork = true
-		workerParam.restartPolicy = v1.RestartPolicyOnFailure
-		// create train pod based on configured parameters
-		_, err = createPodWithTemplate(fc.kubeClient, job, &trainingWorker.Template, workerParam)
+		workerParam.WorkerType = runtime.TrainPodType
+		workerParam.HostNetwork = true
+		workerParam.RestartPolicy = v1.RestartPolicyOnFailure
+		// create training worker based on configured parameters
+		_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, &workerParam)
 		if err != nil {
-			return active, err
+			return active, fmt.Errorf("failed to create %dth training worker: %w", i, err)
 		}
 		active++
 	}
 	return
 }
-func (fc *FederatedController) GetName() string {
-	return "FederatedLearningJobController"
-}
-
-// NewFederatedController creates a new FederatedLearningJob controller that keeps the relevant pods
-// in sync with their corresponding FFederatedLearningJob objects.
-func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
-	namespace := cfg.Namespace
-	if namespace == "" {
-		namespace = metav1.NamespaceAll
-	}
-	kubeClient, err := utils.KubeClient()
-	kubecfg, _ := utils.KubeConfig()
-	crdclient, err := clientset.NewForConfig(kubecfg)
-	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
+// New creates a new federated learning job controller that keeps the relevant pods
+// in sync with their corresponding FederatedLearningJob objects.
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	cfg := cc.Config

-	podInformer := kubeInformerFactory.Core().V1().Pods()
+	podInformer := cc.KubeInformerFactory.Core().V1().Pods()

-	jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
-	jobInformer := jobInformerFactory.Sedna().V1alpha1().FederatedLearningJobs()
+	jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().FederatedLearningJobs()

 	eventBroadcaster := record.NewBroadcaster()
-	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
+	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})

-	fc := &FederatedController{
-		kubeClient: kubeClient,
-		client:     crdclient.SednaV1alpha1(),
+	fc := &Controller{
+		kubeClient: cc.KubeClient,
+		client:     cc.SednaClient.SednaV1alpha1(),

-		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "flJob"),
-		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}),
+		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
+		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: Name + "-controller"}),
 		cfg:      cfg,
 	}

 	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			fc.enqueueController(obj, true)
+
+			// when a federated learning job is added,
+			// send it to edge's LC.
+			fc.syncToEdge(watch.Added, obj)
 		},
 		UpdateFunc: func(old, cur interface{}) {
 			fc.enqueueController(cur, true)
+
+			// when a federated learning job is updated,
+			// send it to edge's LC as Added event.
+			fc.syncToEdge(watch.Added, cur)
 		},
 		DeleteFunc: func(obj interface{}) {
 			fc.enqueueController(obj, true)
+
+			// when a federated learning job is deleted,
+			// send it to edge's LC.
+			fc.syncToEdge(watch.Deleted, obj)
 		},
 	})
 	fc.jobLister = jobInformer.Lister()
 	fc.jobStoreSynced = jobInformer.Informer().HasSynced

@@ -581,8 +590,5 @@ func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
 	fc.podStore = podInformer.Lister()
 	fc.podStoreSynced = podInformer.Informer().HasSynced

-	stopCh := make(chan struct{})
-	kubeInformerFactory.Start(stopCh)
-	jobInformerFactory.Start(stopCh)
-	return fc, err
+	return fc, nil
 }
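
One behavioral consequence of the new constructor: the old NewFederatedController created and started its own informer factories (the kubeInformerFactory.Start/jobInformerFactory.Start calls deleted above), while New only registers handlers on the shared factories in ControllerContext, so the caller must start those factories itself. A hypothetical wiring sketch:

// Hypothetical caller-side wiring; the ControllerContext field names are taken
// from the constructors in this commit, everything else is assumed.
func startFederatedLearning(cc *runtime.ControllerContext, stopCh <-chan struct{}) error {
	fc, err := federatedlearning.New(cc)
	if err != nil {
		return err
	}

	// Start the shared informer factories once, after all handlers are registered.
	cc.KubeInformerFactory.Start(stopCh)
	cc.SednaInformerFactory.Start(stopCh)

	go fc.Run(stopCh) // assumes FeatureControllerI exposes Run(stopCh)
	return nil
}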
@@ -0,0 +1,123 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package federatedlearning
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error {
+	var err error
+	job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{})
+	if err != nil {
+		// federated crd not found
+		return err
+	}
+	modelName := job.Spec.AggregationWorker.Model.Name
+	client := c.client.Models(namespace)
+
+	return runtime.RetryUpdateStatus(modelName, namespace, (func() error {
+		model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		now := metav1.Now()
+		model.Status.UpdateTime = &now
+		model.Status.Metrics = metrics
+		_, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{})
+		return err
+	}))
+}
+
+func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error {
+	client := c.client.FederatedLearningJobs(namespace)
+
+	return runtime.RetryUpdateStatus(name, namespace, (func() error {
+		job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		job.Status.Conditions = append(job.Status.Conditions, cond)
+		_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
+		return err
+	}))
+}
+
+// updateFromEdge updates the federated job's status
+func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) {
+	// JobInfo defines the job information
+	type JobInfo struct {
+		// Current training round
+		CurrentRound int    `json:"currentRound"`
+		UpdateTime   string `json:"updateTime"`
+	}
+
+	// Output defines job output information
+	type Output struct {
+		Models  []runtime.Model `json:"models"`
+		JobInfo *JobInfo        `json:"ownerInfo"`
+	}
+
+	var status struct {
+		Phase  string  `json:"phase"`
+		Status string  `json:"status"`
+		Output *Output `json:"output"`
+	}
+
+	err = json.Unmarshal(content, &status)
+	if err != nil {
+		return
+	}
+
+	output := status.Output
+
+	if output != nil {
+		// Update the model's metrics
+		if len(output.Models) > 0 {
+			// only one model
+			model := output.Models[0]
+			metrics := runtime.ConvertMapToMetrics(model.Metrics)
+			if len(metrics) > 0 {
+				c.updateModelMetrics(name, namespace, metrics)
+			}
+		}
+
+		jobInfo := output.JobInfo
+		// update job info if having any info
+		if jobInfo != nil && jobInfo.CurrentRound > 0 {
+			// Find a good place to save the progress info
+			// TODO: more meaningful reason/message
+			reason := "DoTraining"
+			message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime)
+			cond := NewJobCondition(sednav1.FLJobCondTraining, reason, message)
+			c.appendStatusCondition(name, namespace, cond)
+		}
+	}
+
+	return nil
+}
+
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
+}
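
Going by the struct tags in updateFromEdge, the upstream message body sent by the edge's LC would look roughly like the following. The values are illustrative, and runtime.Model's exact fields are an assumption beyond the Metrics map used above:

// Illustrative payload for updateFromEdge; values are made up.
var examplePayload = []byte(`{
  "phase": "train",
  "status": "completed",
  "output": {
    "models": [
      {"metrics": {"accuracy": "0.92"}}
    ],
    "ownerInfo": {"currentRound": 3, "updateTime": "2021-08-01T00:00:00Z"}
  }
}`)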
| @@ -0,0 +1,145 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package incrementallearning | |||||
| import ( | |||||
| "context" | |||||
| "fmt" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "k8s.io/apimachinery/pkg/watch" | |||||
| "k8s.io/klog/v2" | |||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | |||||
| // syncModelWithName will sync the model to the specified node. | |||||
| // Currently it is called when the incremental learning job is created. | |||||
| func (c *Controller) syncModelWithName(nodeName, modelName, namespace string) error { | |||||
| model, err := c.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| // TODO: maybe use err.ErrStatus.Code == 404 | |||||
| return fmt.Errorf("model(%s/%s) not found", namespace, modelName) | |||||
| } | |||||
| // Since model.Kind may be empty, | |||||
| // we need to fix the kind here if missing. | |||||
| // more details at https://github.com/kubernetes/kubernetes/issues/3030 | |||||
| if len(model.Kind) == 0 { | |||||
| model.Kind = "Model" | |||||
| } | |||||
| runtime.InjectSecretAnnotations(c.kubeClient, model, model.Spec.CredentialName) | |||||
| c.sendToEdgeFunc(nodeName, watch.Added, model) | |||||
| return nil | |||||
| } | |||||
| func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { | |||||
| job, ok := obj.(*sednav1.IncrementalLearningJob) | |||||
| if !ok { | |||||
| return nil | |||||
| } | |||||
| // Since Kind may be empty, | |||||
| // we need to fix the kind here if missing. | |||||
| // more details at https://github.com/kubernetes/kubernetes/issues/3030 | |||||
| job.Kind = KindName | |||||
| jobConditions := job.Status.Conditions | |||||
| if len(jobConditions) == 0 { | |||||
| return nil | |||||
| } | |||||
| dataName := job.Spec.Dataset.Name | |||||
| ds, err := c.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName) | |||||
| } | |||||
| // the LC on the dataset's node owns the dataset object, so that node is referred to as the dataset node | |||||
| dsNodeName := ds.Spec.NodeName | |||||
| var trainNodeName string | |||||
| var evalNodeName string | |||||
| ann := job.GetAnnotations() | |||||
| if ann != nil { | |||||
| trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] | |||||
| evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)] | |||||
| } | |||||
| if eventType == watch.Deleted { | |||||
| // delete jobs from all LCs | |||||
| for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} { | |||||
| if v != "" { | |||||
| c.sendToEdgeFunc(v, eventType, job) | |||||
| } | |||||
| } | |||||
| return nil | |||||
| } | |||||
| latestCondition := jobConditions[len(jobConditions)-1] | |||||
| currentType := latestCondition.Type | |||||
| jobStage := latestCondition.Stage | |||||
| syncModelWithName := func(modelName string) { | |||||
| if err := c.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil { | |||||
| klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", | |||||
| modelName, job.Name, dsNodeName, err) | |||||
| } | |||||
| } | |||||
| syncJobWithNodeName := func(nodeName string) { | |||||
| if err := c.sendToEdgeFunc(nodeName, eventType, job); err != nil { | |||||
| klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v", | |||||
| job.Name, nodeName, jobStage, err) | |||||
| } | |||||
| } | |||||
| runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName) | |||||
| doJobStageEvent := func(modelName string, nodeName string) { | |||||
| if currentType == sednav1.ILJobStageCondWaiting { | |||||
| syncJobWithNodeName(dsNodeName) | |||||
| syncModelWithName(modelName) | |||||
| } else if currentType == sednav1.ILJobStageCondRunning { | |||||
| if nodeName != "" { | |||||
| syncJobWithNodeName(nodeName) | |||||
| } | |||||
| } else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed { | |||||
| if nodeName != dsNodeName { | |||||
| // delete the job from the LC on nodeName when the worker has completed or failed, unless nodeName is the dataset node. | |||||
| c.sendToEdgeFunc(nodeName, watch.Deleted, job) | |||||
| } | |||||
| } | |||||
| } | |||||
| switch jobStage { | |||||
| case sednav1.ILJobTrain: | |||||
| doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) | |||||
| case sednav1.ILJobEval: | |||||
| doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { | |||||
| c.sendToEdgeFunc = f | |||||
| return nil | |||||
| } | |||||
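The train/eval node names read in syncToEdge are ordinary annotations keyed by runtime.AnnotationsKeyPrefix plus the stage name; they are written by setWorkerNodeNameOfJob further down in this diff. A small sketch of that key scheme, assuming a "sedna.io/" prefix and "Train"/"Eval" stage values purely for illustration:

```go
package main

import "fmt"

// stageNodeKey mimics how the controller builds per-stage annotation
// keys: runtime.AnnotationsKeyPrefix + string(stage). The "sedna.io/"
// prefix and the "Train"/"Eval" stage values are assumptions here.
func stageNodeKey(stage string) string {
	return "sedna.io/" + stage
}

func main() {
	ann := map[string]string{
		"sedna.io/Train": "edge-node-1", // written by setWorkerNodeNameOfJob
		"sedna.io/Eval":  "edge-node-2",
	}
	fmt.Println(ann[stageNodeKey("Train")], ann[stageNodeKey("Eval")])
}
```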
| @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and | |||||
| limitations under the License. | limitations under the License. | ||||
| */ | */ | ||||
| package globalmanager | |||||
| package incrementallearning | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| @@ -30,9 +30,8 @@ import ( | |||||
| utilrand "k8s.io/apimachinery/pkg/util/rand" | utilrand "k8s.io/apimachinery/pkg/util/rand" | ||||
| utilruntime "k8s.io/apimachinery/pkg/util/runtime" | utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||||
| "k8s.io/apimachinery/pkg/util/wait" | "k8s.io/apimachinery/pkg/util/wait" | ||||
| kubeinformers "k8s.io/client-go/informers" | |||||
| "k8s.io/apimachinery/pkg/watch" | |||||
| "k8s.io/client-go/kubernetes" | "k8s.io/client-go/kubernetes" | ||||
| "k8s.io/client-go/kubernetes/scheme" | |||||
| v1core "k8s.io/client-go/kubernetes/typed/core/v1" | v1core "k8s.io/client-go/kubernetes/typed/core/v1" | ||||
| corelisters "k8s.io/client-go/listers/core/v1" | corelisters "k8s.io/client-go/listers/core/v1" | ||||
| "k8s.io/client-go/tools/cache" | "k8s.io/client-go/tools/cache" | ||||
| @@ -42,28 +41,33 @@ import ( | |||||
| k8scontroller "k8s.io/kubernetes/pkg/controller" | k8scontroller "k8s.io/kubernetes/pkg/controller" | ||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | ||||
| clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" | |||||
| sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | ||||
| informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" | |||||
| sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" | sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" | ||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | "github.com/kubeedge/sedna/pkg/globalmanager/config" | ||||
| messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/utils" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | ) | ||||
| // ijControllerKind contains the schema.GroupVersionKind for this controller type. | |||||
| var ijControllerKind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob") | |||||
| const ( | |||||
| // Name is this controller name | |||||
| Name = "IncrementalLearning" | |||||
| // KindName is the kind name of CR this controller controls | |||||
| KindName = "IncrementalLearningJob" | |||||
| ) | |||||
| // Kind contains the schema.GroupVersionKind for this controller type. | |||||
| var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) | |||||
| // IncrementalJobController ensures that all IncrementalLearningJob objects have corresponding pods to | |||||
| // Controller ensures that all IncrementalLearningJob objects have corresponding pods to | |||||
| // run their configured workload. | // run their configured workload. | ||||
| type IncrementalJobController struct { | |||||
| type Controller struct { | |||||
| kubeClient kubernetes.Interface | kubeClient kubernetes.Interface | ||||
| client sednaclientset.SednaV1alpha1Interface | client sednaclientset.SednaV1alpha1Interface | ||||
| // podStoreSynced returns true if the pod store has been synced at least once. | // podStoreSynced returns true if the pod store has been synced at least once. | ||||
| // Added as a member to the struct to allow injection for testing. | // Added as a member to the struct to allow injection for testing. | ||||
| podStoreSynced cache.InformerSynced | podStoreSynced cache.InformerSynced | ||||
| // jobStoreSynced returns true if the incrementaljob store has been synced at least once. | |||||
| // jobStoreSynced returns true if the job store has been synced at least once. | |||||
| // Added as a member to the struct to allow injection for testing. | // Added as a member to the struct to allow injection for testing. | ||||
| jobStoreSynced cache.InformerSynced | jobStoreSynced cache.InformerSynced | ||||
| @@ -76,50 +80,49 @@ type IncrementalJobController struct { | |||||
| // IncrementalLearningJobs that need to be updated | // IncrementalLearningJobs that need to be updated | ||||
| queue workqueue.RateLimitingInterface | queue workqueue.RateLimitingInterface | ||||
| recorder record.EventRecorder | |||||
| cfg *config.ControllerConfig | cfg *config.ControllerConfig | ||||
| sendToEdgeFunc runtime.DownstreamSendFunc | |||||
| } | } | ||||
| // Run the main goroutine responsible for watching and syncing jobs. | |||||
| func (jc *IncrementalJobController) Start() error { | |||||
| // Run starts the main goroutine responsible for watching and syncing jobs. | |||||
| func (c *Controller) Run(stopCh <-chan struct{}) { | |||||
| // TODO: make workers parameter | |||||
| workers := 1 | workers := 1 | ||||
| stopCh := messageContext.Done() | |||||
| go func() { | |||||
| defer utilruntime.HandleCrash() | |||||
| defer jc.queue.ShutDown() | |||||
| klog.Infof("Starting incrementallearning job controller") | |||||
| defer klog.Infof("Shutting down incrementallearning job controller") | |||||
| if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) { | |||||
| klog.Errorf("failed to wait for caches to sync") | |||||
| return | |||||
| } | |||||
| klog.Infof("Starting incrementallearning job workers") | |||||
| for i := 0; i < workers; i++ { | |||||
| go wait.Until(jc.worker, time.Second, stopCh) | |||||
| } | |||||
| <-stopCh | |||||
| }() | |||||
| return nil | |||||
| defer utilruntime.HandleCrash() | |||||
| defer c.queue.ShutDown() | |||||
| klog.Infof("Starting %s controller", Name) | |||||
| defer klog.Infof("Shutting down %s controller", Name) | |||||
| if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { | |||||
| klog.Errorf("failed to wait for %s caches to sync", Name) | |||||
| return | |||||
| } | |||||
| klog.Infof("Starting %s job workers", Name) | |||||
| for i := 0; i < workers; i++ { | |||||
| go wait.Until(c.worker, time.Second, stopCh) | |||||
| } | |||||
| <-stopCh | |||||
| } | } | ||||
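Start() used to spawn its own goroutine and pull the stop channel from the websocket message layer; Run(stopCh) hands lifecycle control to the caller. A hedged sketch of the calling pattern the new controllers package presumably uses:

```go
// Hypothetical caller-side wiring; the real fan-out lives in the new
// controllers package, outside this hunk.
func startFeature(c *Controller) (stop func()) {
	stopCh := make(chan struct{})
	go c.Run(stopCh) // Run blocks until stopCh is closed
	return func() { close(stopCh) }
}
```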
| // enqueueByPod enqueues the IncrementalLearningJob object of the specified pod. | // enqueueByPod enqueues the IncrementalLearningJob object of the specified pod. | ||||
| func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { | |||||
| func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { | |||||
| controllerRef := metav1.GetControllerOf(pod) | controllerRef := metav1.GetControllerOf(pod) | ||||
| if controllerRef == nil { | if controllerRef == nil { | ||||
| return | return | ||||
| } | } | ||||
| if controllerRef.Kind != ijControllerKind.Kind { | |||||
| if controllerRef.Kind != Kind.Kind { | |||||
| return | return | ||||
| } | } | ||||
| service, err := jc.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) | |||||
| service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) | |||||
| if err != nil { | if err != nil { | ||||
| return | return | ||||
| } | } | ||||
| @@ -128,27 +131,27 @@ func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { | |||||
| return | return | ||||
| } | } | ||||
| jc.enqueueController(service, immediate) | |||||
| c.enqueueController(service, immediate) | |||||
| } | } | ||||
| // When a pod is created, enqueue the controller that manages it and update its expectations. | // When a pod is created, enqueue the controller that manages it and update its expectations. | ||||
| func (jc *IncrementalJobController) addPod(obj interface{}) { | |||||
| func (c *Controller) addPod(obj interface{}) { | |||||
| pod := obj.(*v1.Pod) | pod := obj.(*v1.Pod) | ||||
| if pod.DeletionTimestamp != nil { | if pod.DeletionTimestamp != nil { | ||||
| // on a restart of the controller, it's possible a new pod shows up in a state that | // on a restart of the controller, it's possible a new pod shows up in a state that | ||||
| // is already pending deletion. Prevent the pod from being a creation observation. | // is already pending deletion. Prevent the pod from being a creation observation. | ||||
| jc.deletePod(pod) | |||||
| c.deletePod(pod) | |||||
| return | return | ||||
| } | } | ||||
| // backoff to queue when PodFailed | // backoff to queue when PodFailed | ||||
| immediate := pod.Status.Phase != v1.PodFailed | immediate := pod.Status.Phase != v1.PodFailed | ||||
| jc.enqueueByPod(pod, immediate) | |||||
| c.enqueueByPod(pod, immediate) | |||||
| } | } | ||||
| // When a pod is updated, figure out which incremental learning job manages it and wake it up. | // When a pod is updated, figure out which incremental learning job manages it and wake it up. | ||||
| func (jc *IncrementalJobController) updatePod(old, cur interface{}) { | |||||
| func (c *Controller) updatePod(old, cur interface{}) { | |||||
| curPod := cur.(*v1.Pod) | curPod := cur.(*v1.Pod) | ||||
| oldPod := old.(*v1.Pod) | oldPod := old.(*v1.Pod) | ||||
| @@ -157,11 +160,11 @@ func (jc *IncrementalJobController) updatePod(old, cur interface{}) { | |||||
| return | return | ||||
| } | } | ||||
| jc.addPod(curPod) | |||||
| c.addPod(curPod) | |||||
| } | } | ||||
| // deletePod enqueues the IncrementalLearningJob object when a pod is deleted | // deletePod enqueues the IncrementalLearningJob object when a pod is deleted | ||||
| func (jc *IncrementalJobController) deletePod(obj interface{}) { | |||||
| func (c *Controller) deletePod(obj interface{}) { | |||||
| pod, ok := obj.(*v1.Pod) | pod, ok := obj.(*v1.Pod) | ||||
| // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go | // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go | ||||
| @@ -182,13 +185,13 @@ func (jc *IncrementalJobController) deletePod(obj interface{}) { | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| jc.enqueueByPod(pod, true) | |||||
| c.enqueueByPod(pod, true) | |||||
| } | } | ||||
| // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item, | // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item, | ||||
| // immediate tells the controller to update the status right away, and should | // immediate tells the controller to update the status right away, and should | ||||
| // happen ONLY when there was a successful pod run. | // happen ONLY when there was a successful pod run. | ||||
| func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate bool) { | |||||
| func (c *Controller) enqueueController(obj interface{}, immediate bool) { | |||||
| key, err := k8scontroller.KeyFunc(obj) | key, err := k8scontroller.KeyFunc(obj) | ||||
| if err != nil { | if err != nil { | ||||
| utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) | utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) | ||||
| @@ -197,36 +200,36 @@ func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate | |||||
| backoff := time.Duration(0) | backoff := time.Duration(0) | ||||
| if !immediate { | if !immediate { | ||||
| backoff = getBackoff(jc.queue, key) | |||||
| backoff = runtime.GetBackoff(c.queue, key) | |||||
| } | } | ||||
| jc.queue.AddAfter(key, backoff) | |||||
| c.queue.AddAfter(key, backoff) | |||||
| } | } | ||||
| // worker runs a worker thread that just dequeues items, processes them, and marks them done. | // worker runs a worker thread that just dequeues items, processes them, and marks them done. | ||||
| // It enforces that the syncHandler is never invoked concurrently with the same key. | // It enforces that the syncHandler is never invoked concurrently with the same key. | ||||
| func (jc *IncrementalJobController) worker() { | |||||
| for jc.processNextWorkItem() { | |||||
| func (c *Controller) worker() { | |||||
| for c.processNextWorkItem() { | |||||
| } | } | ||||
| } | } | ||||
| func (jc *IncrementalJobController) processNextWorkItem() bool { | |||||
| key, quit := jc.queue.Get() | |||||
| func (c *Controller) processNextWorkItem() bool { | |||||
| key, quit := c.queue.Get() | |||||
| if quit { | if quit { | ||||
| return false | return false | ||||
| } | } | ||||
| defer jc.queue.Done(key) | |||||
| defer c.queue.Done(key) | |||||
| forget, err := jc.sync(key.(string)) | |||||
| forget, err := c.sync(key.(string)) | |||||
| if err == nil { | if err == nil { | ||||
| if forget { | if forget { | ||||
| jc.queue.Forget(key) | |||||
| c.queue.Forget(key) | |||||
| } | } | ||||
| return true | return true | ||||
| } | } | ||||
| utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err)) | utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err)) | ||||
| jc.queue.AddRateLimited(key) | |||||
| c.queue.AddRateLimited(key) | |||||
| return true | return true | ||||
| } | } | ||||
| @@ -234,7 +237,7 @@ func (jc *IncrementalJobController) processNextWorkItem() bool { | |||||
| // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning | // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning | ||||
| // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked | // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked | ||||
| // concurrently with the same key. | // concurrently with the same key. | ||||
| func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| func (c *Controller) sync(key string) (bool, error) { | |||||
| startTime := time.Now() | startTime := time.Now() | ||||
| defer func() { | defer func() { | ||||
| klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime)) | klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime)) | ||||
| @@ -247,7 +250,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| if len(ns) == 0 || len(name) == 0 { | if len(ns) == 0 || len(name) == 0 { | ||||
| return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key) | return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key) | ||||
| } | } | ||||
| sharedIncrementalJob, err := jc.jobLister.IncrementalLearningJobs(ns).Get(name) | |||||
| sharedJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name) | |||||
| if err != nil { | if err != nil { | ||||
| if errors.IsNotFound(err) { | if errors.IsNotFound(err) { | ||||
| klog.V(4).Infof("incrementallearning job has been deleted: %v", key) | klog.V(4).Infof("incrementallearning job has been deleted: %v", key) | ||||
| @@ -255,19 +259,21 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| } | } | ||||
| return false, err | return false, err | ||||
| } | } | ||||
| incrementaljob := *sharedIncrementalJob | |||||
| // set kind for incrementaljob in case that the kind is None | |||||
| incrementaljob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob")) | |||||
| // incrementaljob first start, create pod for inference | |||||
| if incrementaljob.Status.StartTime == nil { | |||||
| job := *sharedJob | |||||
| // set kind in case that the kind is None | |||||
| job.SetGroupVersionKind(Kind) | |||||
| // when job is handled at first, create pod for inference | |||||
| if job.Status.StartTime == nil { | |||||
| now := metav1.Now() | now := metav1.Now() | ||||
| incrementaljob.Status.StartTime = &now | |||||
| pod := jc.getSpecifiedPods(&incrementaljob, InferencePodType) | |||||
| job.Status.StartTime = &now | |||||
| pod := c.getSpecifiedPods(&job, runtime.InferencePodType) | |||||
| if pod == nil { | if pod == nil { | ||||
| err = jc.createInferPod(&incrementaljob) | |||||
| err = c.createInferPod(&job) | |||||
| } else { | } else { | ||||
| if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending { | if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending { | ||||
| err = jc.createInferPod(&incrementaljob) | |||||
| err = c.createInferPod(&job) | |||||
| } | } | ||||
| } | } | ||||
| if err != nil { | if err != nil { | ||||
| @@ -275,8 +281,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| } | } | ||||
| } | } | ||||
| // if incrementaljob was finished previously, we don't want to redo the termination | |||||
| if IsIncrementalJobFinished(&incrementaljob) { | |||||
| // if job was finished previously, we don't want to redo the termination | |||||
| if IsJobFinished(&job) { | |||||
| return true, nil | return true, nil | ||||
| } | } | ||||
| @@ -284,20 +290,20 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| jobFailed := false | jobFailed := false | ||||
| needUpdated := false | needUpdated := false | ||||
| // update conditions of incremental job | |||||
| needUpdated, err = jc.updateIncrementalJobConditions(&incrementaljob) | |||||
| // transit this job's state machine | |||||
| needUpdated, err = c.transitJobState(&job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err) | |||||
| } | } | ||||
| if needUpdated { | if needUpdated { | ||||
| if err := jc.updateIncrementalJobStatus(&incrementaljob); err != nil { | |||||
| if err := c.updateJobStatus(&job); err != nil { | |||||
| return forget, err | return forget, err | ||||
| } | } | ||||
| if jobFailed && !IsIncrementalJobFinished(&incrementaljob) { | |||||
| // returning an error will re-enqueue IncrementalJob after the backoff period | |||||
| return forget, fmt.Errorf("failed pod(s) detected for incrementaljob key %q", key) | |||||
| if jobFailed && !IsJobFinished(&job) { | |||||
| // returning an error will re-enqueue IncrementalLearningJob after the backoff period | |||||
| return forget, fmt.Errorf("failed pod(s) detected for incrementallearning job key %q", key) | |||||
| } | } | ||||
| forget = true | forget = true | ||||
| @@ -308,65 +314,60 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { | |||||
| // setWorkerNodeNameOfJob sets the worker nodeName of the specified job | // setWorkerNodeNameOfJob sets the worker nodeName of the specified job | ||||
| // which is used for downstream to sync job info to the specified LC located in nodeName. | // which is used for downstream to sync job info to the specified LC located in nodeName. | ||||
| func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { | |||||
| key := AnnotationsKeyPrefix + jobStage | |||||
| func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { | |||||
| key := runtime.AnnotationsKeyPrefix + jobStage | |||||
| ann := job.GetAnnotations() | ann := job.GetAnnotations() | ||||
| if ann != nil { | |||||
| if ann[key] == nodeName { | |||||
| // already set | |||||
| return nil | |||||
| } | |||||
| if ann[key] == nodeName { | |||||
| // already set | |||||
| return nil | |||||
| } | } | ||||
| dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName) | |||||
| jobClient := jc.client.IncrementalLearningJobs(job.Namespace) | |||||
| var err error | |||||
| for i := 0; i <= ResourceUpdateRetries; i++ { | |||||
| var newJob *sednav1.IncrementalLearningJob | |||||
| newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) | |||||
| jobClient := c.client.IncrementalLearningJobs(job.Namespace) | |||||
| return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { | |||||
| newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| break | |||||
| return err | |||||
| } | } | ||||
| annotations := newJob.GetAnnotations() | annotations := newJob.GetAnnotations() | ||||
| if annotations != nil { | |||||
| if annotations[key] == nodeName { | |||||
| return nil | |||||
| } | |||||
| } | |||||
| if _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}); err == nil { | |||||
| break | |||||
| } | |||||
| } | |||||
| return err | |||||
| if annotations[key] == nodeName { | |||||
| return nil | |||||
| } | |||||
| dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName) | |||||
| _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | } | ||||
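The annotation is written with a JSON merge patch so only that one key is touched. Concretely, the patch body produced by dataStr looks like this (the key prefix and values are illustrative, not taken from a real cluster):

```go
package main

import "fmt"

func main() {
	// Mirrors the dataStr format string above; "sedna.io/Train" assumes
	// runtime.AnnotationsKeyPrefix + stage, which is not shown here.
	key := "sedna.io/Train"
	nodeName := "edge-node-1"
	fmt.Printf(`{"metadata":{"annotations":{"%s":"%s"}}}`+"\n", key, nodeName)
	// Prints: {"metadata":{"annotations":{"sedna.io/Train":"edge-node-1"}}}
}
```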
| // updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus | |||||
| func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) { | |||||
| // transitJobState transits the job to its next state | |||||
| func (c *Controller) transitJobState(job *sednav1.IncrementalLearningJob) (bool, error) { | |||||
| var initialType sednav1.ILJobStageConditionType | var initialType sednav1.ILJobStageConditionType | ||||
| var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{ | var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{ | ||||
| Stage: sednav1.ILJobTrain, | Stage: sednav1.ILJobTrain, | ||||
| Type: initialType, | Type: initialType, | ||||
| } | } | ||||
| var newConditionType sednav1.ILJobStageConditionType | var newConditionType sednav1.ILJobStageConditionType | ||||
| var needUpdated = false | var needUpdated = false | ||||
| jobConditions := incrementaljob.Status.Conditions | |||||
| var podStatus v1.PodPhase = v1.PodUnknown | var podStatus v1.PodPhase = v1.PodUnknown | ||||
| var pod *v1.Pod | var pod *v1.Pod | ||||
| jobConditions := job.Status.Conditions | |||||
| if len(jobConditions) > 0 { | if len(jobConditions) > 0 { | ||||
| // get latest pod and pod status | // get latest pod and pod status | ||||
| latestCondition = (jobConditions)[len(jobConditions)-1] | latestCondition = (jobConditions)[len(jobConditions)-1] | ||||
| klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name, | |||||
| klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", job.Namespace, job.Name, | |||||
| latestCondition.Stage) | latestCondition.Stage) | ||||
| pod = jc.getSpecifiedPods(incrementaljob, string(latestCondition.Stage)) | |||||
| pod = c.getSpecifiedPods(job, string(latestCondition.Stage)) | |||||
| if pod != nil { | if pod != nil { | ||||
| podStatus = pod.Status.Phase | podStatus = pod.Status.Phase | ||||
| } | } | ||||
| } | } | ||||
| jobStage := latestCondition.Stage | jobStage := latestCondition.Stage | ||||
| currentType := latestCondition.Type | currentType := latestCondition.Type | ||||
| newConditionType = currentType | newConditionType = currentType | ||||
| @@ -383,14 +384,14 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo | |||||
| // include train, eval, deploy pod | // include train, eval, deploy pod | ||||
| var err error | var err error | ||||
| if jobStage == sednav1.ILJobDeploy { | if jobStage == sednav1.ILJobDeploy { | ||||
| err = jc.restartInferPod(incrementaljob) | |||||
| err = c.restartInferPod(job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) | |||||
| } else { | } else { | ||||
| klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) | |||||
| } | } | ||||
| } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { | } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { | ||||
| err = jc.createPod(incrementaljob, jobStage) | |||||
| err = c.createPod(job, jobStage) | |||||
| } | } | ||||
| if err != nil { | if err != nil { | ||||
| return needUpdated, err | return needUpdated, err | ||||
| @@ -406,17 +407,17 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo | |||||
| newConditionType = sednav1.ILJobStageCondRunning | newConditionType = sednav1.ILJobStageCondRunning | ||||
| // add nodeName to job | // add nodeName to job | ||||
| if err := jc.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil { | |||||
| if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil { | |||||
| return needUpdated, err | return needUpdated, err | ||||
| } | } | ||||
| } | } | ||||
| } else if podStatus == v1.PodSucceeded { | } else if podStatus == v1.PodSucceeded { | ||||
| // watch pod status, if pod completed, set type completed | // watch pod status, if pod completed, set type completed | ||||
| newConditionType = sednav1.ILJobStageCondCompleted | newConditionType = sednav1.ILJobStageCondCompleted | ||||
| klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", incrementaljob.Namespace, incrementaljob.Name, jobStage) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage) | |||||
| } else if podStatus == v1.PodFailed { | } else if podStatus == v1.PodFailed { | ||||
| newConditionType = sednav1.ILJobStageCondFailed | newConditionType = sednav1.ILJobStageCondFailed | ||||
| klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", incrementaljob.Namespace, incrementaljob.Name, jobStage) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage) | |||||
| } | } | ||||
| case sednav1.ILJobStageCondCompleted: | case sednav1.ILJobStageCondCompleted: | ||||
| jobStage = getNextStage(jobStage) | jobStage = getNextStage(jobStage) | ||||
| @@ -429,31 +430,29 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo | |||||
| default: | default: | ||||
| // do nothing when given other type out of cases | // do nothing when given other type out of cases | ||||
| } | } | ||||
| klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", incrementaljob.Namespace, incrementaljob.Name, jobConditions) | |||||
| klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions) | |||||
| if latestCondition.Type != newConditionType { | if latestCondition.Type != newConditionType { | ||||
| incrementaljob.Status.Conditions = append(incrementaljob.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage)) | |||||
| job.Status.Conditions = append(job.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage)) | |||||
| needUpdated = true | needUpdated = true | ||||
| return needUpdated, nil | |||||
| } | } | ||||
| return needUpdated, nil | return needUpdated, nil | ||||
| } | } | ||||
| // updateIncrementalJobStatus ensures that jobstatus can be updated rightly | |||||
| func (jc *IncrementalJobController) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error { | |||||
| jobClient := jc.client.IncrementalLearningJobs(incrementaljob.Namespace) | |||||
| var err error | |||||
| for i := 0; i <= ResourceUpdateRetries; i++ { | |||||
| var newIncrementalJob *sednav1.IncrementalLearningJob | |||||
| newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{}) | |||||
| // updateJobStatus ensures that the job status is updated correctly | |||||
| func (c *Controller) updateJobStatus(job *sednav1.IncrementalLearningJob) error { | |||||
| jobClient := c.client.IncrementalLearningJobs(job.Namespace) | |||||
| return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { | |||||
| newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| break | |||||
| return err | |||||
| } | } | ||||
| newIncrementalJob.Status = incrementaljob.Status | |||||
| if _, err = jobClient.UpdateStatus(context.TODO(), newIncrementalJob, metav1.UpdateOptions{}); err == nil { | |||||
| break | |||||
| } | |||||
| } | |||||
| return err | |||||
| newJob.Status = job.Status | |||||
| _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | } | ||||
| func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition { | func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition { | ||||
| @@ -468,26 +467,29 @@ func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, j | |||||
| } | } | ||||
| } | } | ||||
| func (jc *IncrementalJobController) generatePodName(jobName string, workerType string) string { | |||||
| func (c *Controller) generatePodName(jobName string, workerType string) string { | |||||
| return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) | return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) | ||||
| } | } | ||||
| func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod { | |||||
| if podType == "Deploy" { | |||||
| podType = InferencePodType | |||||
| } | |||||
| func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod { | |||||
| var latestPod *v1.Pod | var latestPod *v1.Pod | ||||
| selector, _ := GenerateSelector(job) | |||||
| pods, err := jc.podStore.Pods(job.Namespace).List(selector) | |||||
| selector, _ := runtime.GenerateSelector(job) | |||||
| pods, err := c.podStore.Pods(job.Namespace).List(selector) | |||||
| if len(pods) == 0 || err != nil { | if len(pods) == 0 || err != nil { | ||||
| return nil | return nil | ||||
| } | } | ||||
| var matchTag = false | var matchTag = false | ||||
| latestPod = pods[0] | latestPod = pods[0] | ||||
| if podType == "Deploy" { | |||||
| podType = runtime.InferencePodType | |||||
| } | |||||
| for _, pod := range pods { | for _, pod := range pods { | ||||
| s := strings.Split(pod.Name, "-") | s := strings.Split(pod.Name, "-") | ||||
| CurrentPodType := s[len(s)-2] | |||||
| if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) { | |||||
| currentPodType := s[len(s)-2] | |||||
| if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && currentPodType == strings.ToLower(podType) { | |||||
| latestPod = pod | latestPod = pod | ||||
| matchTag = true | matchTag = true | ||||
| } | } | ||||
| @@ -498,20 +500,22 @@ func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLea | |||||
| return latestPod | return latestPod | ||||
| } | } | ||||
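getSpecifiedPods depends on the naming convention from generatePodName (jobName-workertype-rand5): the second-to-last dash-separated segment is the lower-cased worker type. A quick illustration of that lookup:

```go
package main

import (
	"fmt"
	"strings"
)

// workerTypeOf mirrors the s[len(s)-2] lookup in getSpecifiedPods,
// given pod names of the form jobName-workertype-rand5.
func workerTypeOf(podName string) string {
	s := strings.Split(podName, "-")
	return s[len(s)-2]
}

func main() {
	fmt.Println(workerTypeOf("il-job-train-x7k2q"))    // train
	fmt.Println(workerTypeOf("il-job-inference-a1b2c")) // inference
}
```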
| func (jc *IncrementalJobController) restartInferPod(job *sednav1.IncrementalLearningJob) error { | |||||
| inferPod := jc.getSpecifiedPods(job, InferencePodType) | |||||
| func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error { | |||||
| inferPod := c.getSpecifiedPods(job, runtime.InferencePodType) | |||||
| if inferPod == nil { | if inferPod == nil { | ||||
| klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name) | klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name) | ||||
| err := jc.createInferPod(job) | |||||
| err := c.createInferPod(job) | |||||
| return err | return err | ||||
| } | } | ||||
| ctx := context.Background() | ctx := context.Background() | ||||
| err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) | |||||
| err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | ||||
| return err | return err | ||||
| } | } | ||||
| err = jc.createInferPod(job) | |||||
| err = c.createInferPod(job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | ||||
| return err | return err | ||||
| @@ -532,14 +536,14 @@ func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage { | |||||
| } | } | ||||
| } | } | ||||
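Only getNextStage's tail is visible in this hunk. Based on the stages referenced elsewhere in the diff, the cycle is presumably Train -> Eval -> Deploy -> Train; a reconstruction to that effect (verify against the full source):

```go
// Reconstructed from the stages used elsewhere in this diff; confirm
// against the full file.
func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage {
	switch currentStage {
	case sednav1.ILJobTrain:
		return sednav1.ILJobEval
	case sednav1.ILJobEval:
		return sednav1.ILJobDeploy
	case sednav1.ILJobDeploy:
		return sednav1.ILJobTrain
	default:
		return sednav1.ILJobTrain
	}
}
```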
| func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool { | |||||
| func IsJobFinished(j *sednav1.IncrementalLearningJob) bool { | |||||
| // TODO | // TODO | ||||
| return false | return false | ||||
| } | } | ||||
| func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { | |||||
| func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { | |||||
| if name != "" { | if name != "" { | ||||
| secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| err = fmt.Errorf("failed to get the secret %s for %s: %w", | err = fmt.Errorf("failed to get the secret %s for %s: %w", | ||||
| name, | name, | ||||
| @@ -549,7 +553,7 @@ func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr s | |||||
| return | return | ||||
| } | } | ||||
| func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { | |||||
| func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { | |||||
| ctx := context.Background() | ctx := context.Background() | ||||
| var podTemplate *v1.PodTemplateSpec | var podTemplate *v1.PodTemplateSpec | ||||
| @@ -558,25 +562,25 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| deployModelName := job.Spec.DeploySpec.Model.Name | deployModelName := job.Spec.DeploySpec.Model.Name | ||||
| // check initial model name | // check initial model name | ||||
| initialModel, err := jc.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) | |||||
| initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("failed to get initial model %s: %w", | return fmt.Errorf("failed to get initial model %s: %w", | ||||
| initialModelName, err) | initialModelName, err) | ||||
| } | } | ||||
| _, err = jc.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) | |||||
| _, err = c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("failed to get deploy model %s: %w", | return fmt.Errorf("failed to get deploy model %s: %w", | ||||
| deployModelName, err) | deployModelName, err) | ||||
| } | } | ||||
| dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) | |||||
| dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("failed to get dataset %s: %w", | return fmt.Errorf("failed to get dataset %s: %w", | ||||
| incrementalDatasetName, err) | incrementalDatasetName, err) | ||||
| } | } | ||||
| datasetSecret, err := jc.getSecret( | |||||
| datasetSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| dataset.Spec.CredentialName, | dataset.Spec.CredentialName, | ||||
| fmt.Sprintf("dataset %s", dataset.Name), | fmt.Sprintf("dataset %s", dataset.Name), | ||||
| @@ -585,7 +589,7 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| return err | return err | ||||
| } | } | ||||
| jobSecret, err := jc.getSecret( | |||||
| jobSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| job.Spec.CredentialName, | job.Spec.CredentialName, | ||||
| fmt.Sprintf("incremental job %s", job.Name), | fmt.Sprintf("incremental job %s", job.Name), | ||||
| @@ -595,13 +599,14 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| } | } | ||||
| // get all URLs for train and eval from the condition data | // get all URLs for train and eval from the condition data | ||||
| var cond IncrementalCondData | |||||
| condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data | condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data | ||||
| klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) | klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) | ||||
| var cond IncrementalCondData | |||||
| (&cond).Unmarshal([]byte(condDataStr)) | (&cond).Unmarshal([]byte(condDataStr)) | ||||
| if cond.Input == nil { | if cond.Input == nil { | ||||
| return fmt.Errorf("empty input from condData") | return fmt.Errorf("empty input from condData") | ||||
| } | } | ||||
| dataURL := cond.Input.DataURL | dataURL := cond.Input.DataURL | ||||
| inputmodelURLs := cond.GetInputModelURLs() | inputmodelURLs := cond.GetInputModelURLs() | ||||
| @@ -614,25 +619,26 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| originalDataURLOrIndex = dataset.Spec.URL | originalDataURLOrIndex = dataset.Spec.URL | ||||
| } | } | ||||
| var workerParam *WorkerParam = new(WorkerParam) | |||||
| var workerParam runtime.WorkerParam | |||||
| if podtype == sednav1.ILJobTrain { | if podtype == sednav1.ILJobTrain { | ||||
| workerParam.workerType = TrainPodType | |||||
| workerParam.WorkerType = runtime.TrainPodType | |||||
| podTemplate = &job.Spec.TrainSpec.Template | podTemplate = &job.Spec.TrainSpec.Template | ||||
| // Env parameters for train | |||||
| workerParam.env = map[string]string{ | |||||
| // Env parameters for train | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "train-worker-" + utilrand.String(5), | "WORKER_NAME": "train-worker-" + utilrand.String(5), | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| } | } | ||||
| baseModelURL := inputmodelURLs[0] | baseModelURL := inputmodelURLs[0] | ||||
| var baseModelSecret *v1.Secret | var baseModelSecret *v1.Secret | ||||
| if baseModelURL == initialModel.Spec.URL { | if baseModelURL == initialModel.Spec.URL { | ||||
| baseModelSecret, err = jc.getSecret( | |||||
| baseModelSecret, err = c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| initialModel.Spec.CredentialName, | initialModel.Spec.CredentialName, | ||||
| fmt.Sprintf("initial model %s", initialModelName), | fmt.Sprintf("initial model %s", initialModelName), | ||||
| @@ -644,17 +650,17 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| baseModelSecret = jobSecret | baseModelSecret = jobSecret | ||||
| } | } | ||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: baseModelURL, | URL: baseModelURL, | ||||
| Secret: baseModelSecret, | Secret: baseModelSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| }, | }, | ||||
| EnvName: "BASE_MODEL_URL", | EnvName: "BASE_MODEL_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: cond.Input.OutputDir, | URL: cond.Input.OutputDir, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: false, | DownloadByInitializer: false, | ||||
| @@ -662,8 +668,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| EnvName: "MODEL_URL", | EnvName: "MODEL_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: dataURL, | URL: dataURL, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| @@ -672,8 +678,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| }, | }, | ||||
| // see https://github.com/kubeedge/sedna/issues/35 | // see https://github.com/kubeedge/sedna/issues/35 | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| URL: originalDataURLOrIndex, | URL: originalDataURLOrIndex, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -683,23 +689,23 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| }, | }, | ||||
| ) | ) | ||||
| } else { | } else { | ||||
| // Configure eval worker's mounts and envs | |||||
| podTemplate = &job.Spec.EvalSpec.Template | podTemplate = &job.Spec.EvalSpec.Template | ||||
| workerParam.workerType = "Eval" | |||||
| workerParam.WorkerType = "Eval" | |||||
| // Configure Env information for eval by initial WorkerParam | |||||
| workerParam.env = map[string]string{ | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "eval-worker-" + utilrand.String(5), | "WORKER_NAME": "eval-worker-" + utilrand.String(5), | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| } | } | ||||
| var modelMountURLs []MountURL | |||||
| var modelMountURLs []runtime.MountURL | |||||
| for _, url := range inputmodelURLs { | for _, url := range inputmodelURLs { | ||||
| var modelSecret *v1.Secret | var modelSecret *v1.Secret | ||||
| if url == initialModel.Spec.URL { | if url == initialModel.Spec.URL { | ||||
| modelSecret, err = jc.getSecret( | |||||
| modelSecret, err = c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| initialModel.Spec.CredentialName, | initialModel.Spec.CredentialName, | ||||
| fmt.Sprintf("initial model %s", initialModelName), | fmt.Sprintf("initial model %s", initialModelName), | ||||
| @@ -711,21 +717,21 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| modelSecret = jobSecret | modelSecret = jobSecret | ||||
| } | } | ||||
| modelMountURLs = append(modelMountURLs, MountURL{ | |||||
| modelMountURLs = append(modelMountURLs, runtime.MountURL{ | |||||
| URL: url, | URL: url, | ||||
| Secret: modelSecret, | Secret: modelSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| }) | }) | ||||
| } | } | ||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URLs: modelMountURLs, | URLs: modelMountURLs, | ||||
| Name: "models", | Name: "models", | ||||
| EnvName: "MODEL_URLS", | EnvName: "MODEL_URLS", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: dataURL, | URL: dataURL, | ||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -734,8 +740,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| EnvName: "TEST_DATASET_URL", | EnvName: "TEST_DATASET_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| URL: originalDataURLOrIndex, | URL: originalDataURLOrIndex, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -748,40 +754,38 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo | |||||
| } | } | ||||
| // set the default policy instead of Always policy | // set the default policy instead of Always policy | ||||
| workerParam.restartPolicy = v1.RestartPolicyOnFailure | |||||
| workerParam.hostNetwork = true | |||||
| workerParam.RestartPolicy = v1.RestartPolicyOnFailure | |||||
| workerParam.HostNetwork = true | |||||
| // create pod based on podtype | // create pod based on podtype | ||||
| _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, &workerParam) | |||||
| return | return | ||||
| } | } | ||||
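createPod now populates the exported runtime.WorkerParam instead of the old unexported fields. The following shape is inferred only from the fields used in this diff, not from the runtime package itself:

```go
// Inferred from usage in this diff only; the authoritative definitions
// live in pkg/globalmanager/runtime.
package runtime

import v1 "k8s.io/api/core/v1"

type MountURL struct {
	URL                   string
	Secret                *v1.Secret
	DownloadByInitializer bool // fetched by an init container before the worker starts
}

type WorkerMount struct {
	Name    string
	URL     *MountURL  // a single mount...
	URLs    []MountURL // ...or several, e.g. MODEL_URLS for eval
	EnvName string     // env var exposing the mounted path(s) to the worker
}

type WorkerParam struct {
	WorkerType    string
	Env           map[string]string
	Mounts        []WorkerMount
	RestartPolicy v1.RestartPolicy
	HostNetwork   bool
}
```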
| func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearningJob) error { | |||||
| func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error { | |||||
| infermodelName := job.Spec.DeploySpec.Model.Name | infermodelName := job.Spec.DeploySpec.Model.Name | ||||
| inferModel, err := jc.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{}) | |||||
| inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("failed to get infer model %s: %w", | return fmt.Errorf("failed to get infer model %s: %w", | ||||
| infermodelName, err) | infermodelName, err) | ||||
| } | } | ||||
| inferModelURL := inferModel.Spec.URL | inferModelURL := inferModel.Spec.URL | ||||
| // Env parameters for edge | |||||
| HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters) | HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters) | ||||
| HEMParameterString := string(HEMParameterJSON) | HEMParameterString := string(HEMParameterJSON) | ||||
| // Configure container mounting and Env information by initial WorkerParam | |||||
| modelSecret, err := jc.getSecret( | |||||
| modelSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| inferModel.Spec.CredentialName, | inferModel.Spec.CredentialName, | ||||
| fmt.Sprintf("model %s", inferModel.Name), | fmt.Sprintf("model %s", inferModel.Name), | ||||
| ) | ) | ||||
| var workerParam *WorkerParam = new(WorkerParam) | |||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| // Configure inference worker's mounts and envs | |||||
| var workerParam runtime.WorkerParam | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: inferModelURL, | URL: inferModelURL, | ||||
| Secret: modelSecret, | Secret: modelSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -791,7 +795,7 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn | |||||
| }, | }, | ||||
| ) | ) | ||||
| workerParam.env = map[string]string{ | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "inferworker-" + utilrand.String(5), | "WORKER_NAME": "inferworker-" + utilrand.String(5), | ||||
| @@ -799,71 +803,48 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn | |||||
| "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name, | "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name, | ||||
| "HEM_PARAMETERS": HEMParameterString, | "HEM_PARAMETERS": HEMParameterString, | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| } | } | ||||
| workerParam.workerType = InferencePodType | |||||
| workerParam.hostNetwork = true | |||||
| workerParam.WorkerType = runtime.InferencePodType | |||||
| workerParam.HostNetwork = true | |||||
| // create edge pod | |||||
| _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) | |||||
| // create the inference worker | |||||
| _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, &workerParam) | |||||
| return err | return err | ||||
| } | } | ||||
| // GetName returns the name of the incrementallearning job controller | |||||
| func (jc *IncrementalJobController) GetName() string { | |||||
| return "IncrementalLearningJobController" | |||||
| } | |||||
| // NewIncrementalJobController creates a new IncrementalJob controller that keeps the relevant pods | |||||
| // in sync with their corresponding IncrementalJob objects. | |||||
| func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { | |||||
| namespace := cfg.Namespace | |||||
| if namespace == "" { | |||||
| namespace = metav1.NamespaceAll | |||||
| } | |||||
| kubeClient, err := utils.KubeClient() | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| kubecfg, err := utils.KubeConfig() | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| crdclient, err := clientset.NewForConfig(kubecfg) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) | |||||
| podInformer := kubeInformerFactory.Core().V1().Pods() | |||||
| jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) | |||||
| jobInformer := jobInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() | |||||
| // New creates a new incremental learning job controller that keeps the relevant pods | |||||
| // in sync with the corresponding IncrementalLearningJob objects. | |||||
| func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { | |||||
| podInformer := cc.KubeInformerFactory.Core().V1().Pods() | |||||
| jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() | |||||
| eventBroadcaster := record.NewBroadcaster() | eventBroadcaster := record.NewBroadcaster() | ||||
| eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) | |||||
| eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) | |||||
| jc := &IncrementalJobController{ | |||||
| kubeClient: kubeClient, | |||||
| client: crdclient.SednaV1alpha1(), | |||||
| queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "incrementallearningjob"), | |||||
| recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}), | |||||
| cfg: cfg, | |||||
| jc := &Controller{ | |||||
| kubeClient: cc.KubeClient, | |||||
| client: cc.SednaClient.SednaV1alpha1(), | |||||
| queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), | |||||
| cfg: cc.Config, | |||||
| } | } | ||||
| jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||||
| AddFunc: func(obj interface{}) { | AddFunc: func(obj interface{}) { | ||||
| jc.enqueueController(obj, true) | jc.enqueueController(obj, true) | ||||
| jc.syncToEdge(watch.Added, obj) | |||||
| }, | }, | ||||
| UpdateFunc: func(old, cur interface{}) { | UpdateFunc: func(old, cur interface{}) { | ||||
| jc.enqueueController(cur, true) | jc.enqueueController(cur, true) | ||||
| jc.syncToEdge(watch.Added, cur) | |||||
| }, | }, | ||||
| DeleteFunc: func(obj interface{}) { | DeleteFunc: func(obj interface{}) { | ||||
| jc.enqueueController(obj, true) | jc.enqueueController(obj, true) | ||||
| jc.syncToEdge(watch.Deleted, obj) | |||||
| }, | }, | ||||
| }) | }) | ||||
| jc.jobLister = jobInformer.Lister() | jc.jobLister = jobInformer.Lister() | ||||
| @@ -877,8 +858,5 @@ func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControlle | |||||
| jc.podStore = podInformer.Lister() | jc.podStore = podInformer.Lister() | ||||
| jc.podStoreSynced = podInformer.Informer().HasSynced | jc.podStoreSynced = podInformer.Informer().HasSynced | ||||
| stopCh := make(chan struct{}) | |||||
| kubeInformerFactory.Start(stopCh) | |||||
| jobInformerFactory.Start(stopCh) | |||||
| return jc, err | |||||
| return jc, nil | |||||
| } | } | ||||
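With every feature now exposing the same `New(cc *runtime.ControllerContext)` constructor, the `controllers` package can assemble features generically instead of hard-coding each one. A minimal sketch of that pattern, assuming a registry keyed by feature name; `FeatureFactory`, `registry`, and `Register` are illustrative names, not Sedna's actual API:

    // Hypothetical glue code in the controllers package (illustrative only).
    package controllers

    import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

    // FeatureFactory builds one feature controller from the shared context.
    type FeatureFactory func(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error)

    // registry maps a feature name to its constructor.
    var registry = map[string]FeatureFactory{}

    // Register records a feature constructor under its name, e.g.
    // Register("IncrementalLearning", incrementallearning.New).
    func Register(name string, f FeatureFactory) {
        registry[name] = f
    }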
@@ -0,0 +1,162 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package incrementallearning
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// Model is an alias of runtime.Model for brevity.
+type Model = runtime.Model
+
+// IncrementalCondData carries the data of a job condition,
+// i.e. the input/output of the next stage.
+type IncrementalCondData struct {
+	Input *struct {
+		// Model covers the single-model case.
+		Model  *Model  `json:"model,omitempty"`
+		Models []Model `json:"models,omitempty"`
+
+		DataURL string `json:"dataURL,omitempty"`
+
+		// DataIndexURL is the URL where the reference list of data samples is stored.
+		// The content of this URL would be:
+		// # the first uncommented line means the directory
+		// s3://dataset/
+		// mnist/0.jpg
+		// mnist/1.jpg
+		DataIndexURL string `json:"dataIndexURL,omitempty"`
+
+		OutputDir string `json:"outputDir,omitempty"`
+	} `json:"input,omitempty"`
+
+	Output *struct {
+		Model  *Model  `json:"model,omitempty"`
+		Models []Model `json:"models,omitempty"`
+	} `json:"output,omitempty"`
+}
+
+func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string {
+	var modelURLs []string
+	if model != nil {
+		modelURLs = append(modelURLs, model.GetURL())
+	} else {
+		for _, m := range models {
+			modelURLs = append(modelURLs, m.GetURL())
+		}
+	}
+	return modelURLs
+}
+
+func (cd *IncrementalCondData) GetInputModelURLs() []string {
+	return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
+}
+
+func (cd *IncrementalCondData) GetOutputModelURLs() []string {
+	return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
+}
+
+func (cd *IncrementalCondData) Unmarshal(data []byte) error {
+	return json.Unmarshal(data, cd)
+}
+
+func (cd IncrementalCondData) Marshal() ([]byte, error) {
+	return json.Marshal(cd)
+}
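To make the payload shape concrete, here is a round trip through the helpers above. It would live in the same incrementallearning package; the sample URLs are invented, and it assumes runtime.Model serializes its URL under the `url` key:

    // Sketch only: parse a condition payload and read the input model URLs.
    func exampleCondData() error {
        payload := []byte(`{"input":{"model":{"url":"s3://models/base.pb"},"dataURL":"s3://dataset/train.txt"}}`)

        var cd IncrementalCondData
        if err := cd.Unmarshal(payload); err != nil {
            return err
        }
        // joinModelURLs prefers the single-model field over the model list.
        fmt.Println(cd.GetInputModelURLs()) // [s3://models/base.pb]
        return nil
    }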
+func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error {
+	client := c.client.IncrementalLearningJobs(namespace)
+
+	return runtime.RetryUpdateStatus(name, namespace, func() error {
+		job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		job.Status.Conditions = append(job.Status.Conditions, cond)
+		_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
+		return err
+	})
+}
+
+// updateFromEdge syncs the edge updates to k8s.
+func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
+	var jobStatus struct {
+		Phase  string `json:"phase"`
+		Status string `json:"status"`
+	}
+
+	err := json.Unmarshal(content, &jobStatus)
+	if err != nil {
+		return err
+	}
+
+	// Get the condition data.
+	// Here unmarshal and marshal immediately to skip the unnecessary fields.
+	var condData IncrementalCondData
+	err = json.Unmarshal(content, &condData)
+	if err != nil {
+		return err
+	}
+	condDataBytes, _ := json.Marshal(&condData)
+
+	cond := sednav1.ILJobCondition{
+		Status:             v1.ConditionTrue,
+		LastHeartbeatTime:  metav1.Now(),
+		LastTransitionTime: metav1.Now(),
+		Data:               string(condDataBytes),
+		Message:            "reported by lc",
+	}
+
+	switch strings.ToLower(jobStatus.Phase) {
+	case "train":
+		cond.Stage = sednav1.ILJobTrain
+	case "eval":
+		cond.Stage = sednav1.ILJobEval
+	case "deploy":
+		cond.Stage = sednav1.ILJobDeploy
+	default:
+		return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
+	}
+
+	switch strings.ToLower(jobStatus.Status) {
+	case "ready":
+		cond.Type = sednav1.ILJobStageCondReady
+	case "completed":
+		cond.Type = sednav1.ILJobStageCondCompleted
+	case "failed":
+		cond.Type = sednav1.ILJobStageCondFailed
+	case "waiting":
+		cond.Type = sednav1.ILJobStageCondWaiting
+	default:
+		return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
+	}
+
+	err = c.appendStatusCondition(name, namespace, cond)
+	if err != nil {
+		return fmt.Errorf("failed to append condition, err: %w", err)
+	}
+	return nil
+}
+
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
+}
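For reference, an LC report that exercises the mapping above could look like the invented message below; `updateFromEdge` would record it as a condition with stage `Train` and type `Completed`:

    // Illustrative only: drive updateFromEdge with a hand-written report.
    func exampleUpstream(c *Controller) error {
        msg := []byte(`{"phase":"train","status":"completed","output":{"model":{"url":"s3://models/v2.pb"}}}`)
        return c.updateFromEdge("il-example-job", "default", "status", msg)
    }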
@@ -0,0 +1,56 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package jointinference
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	joint, ok := obj.(*sednav1.JointInferenceService)
+	if !ok {
+		return nil
+	}
+
+	// Since Kind may be empty, fix it here if missing.
+	// More details at https://github.com/kubernetes/kubernetes/issues/3030
+	joint.Kind = KindName
+
+	// Only propagate to the nodes with a non-empty name.
+	// FIXME: only the case where Spec.NodeName is specified is supported.
+	nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName
+	if len(nodeName) == 0 {
+		return fmt.Errorf("empty node name")
+	}
+
+	if len(joint.Kind) == 0 {
+		joint.Kind = KindName
+	}
+
+	return c.sendToEdgeFunc(nodeName, eventType, joint)
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+	return nil
+}
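A sketch of how the message layer can plug in its sender through `SetDownstreamSendFunc`; the closure body is a stand-in and only assumes that `runtime.DownstreamSendFunc` matches the three-argument call in `syncToEdge`:

    // Illustrative wiring; a real sender would marshal obj and push it to the node.
    func wireDownstream(c *Controller) error {
        return c.SetDownstreamSendFunc(func(nodeName string, eventType watch.EventType, obj interface{}) error {
            fmt.Printf("send %v event to node %s: %T\n", eventType, nodeName, obj)
            return nil
        })
    }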
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package globalmanager
+package jointinference

 import (
 	"context"
@@ -29,7 +29,7 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
-	kubeinformers "k8s.io/client-go/informers"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -41,26 +41,32 @@ import (
 	k8scontroller "k8s.io/kubernetes/pkg/controller"

 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
 	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
-	informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
 	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
-	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 )

+const (
+	// Name is this controller name
+	Name = "JointInference"
+
+	// KindName is the kind name of CR this controller controls
+	KindName = "JointInferenceService"
+)
+
 const (
 	jointInferenceForEdge  = "Edge"
 	jointInferenceForCloud = "Cloud"
+
+	bigModelPort = 5000
 )

-// jointServiceControllerKind contains the schema.GroupVersionKind for this controller type.
-var jointServiceControllerKind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService")
+// Kind contains the schema.GroupVersionKind for this controller type.
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)

-// JointInferenceServiceController ensures that all JointInferenceService objects
+// Controller ensures that all JointInferenceService objects
 // have corresponding pods to run their configured workload.
-type JointInferenceServiceController struct {
+type Controller struct {
 	kubeClient kubernetes.Interface
 	client     sednaclientset.SednaV1alpha1Interface
@@ -69,7 +75,7 @@ type JointInferenceServiceController struct {
 	// A store of pods
 	podStore corelisters.PodLister

-	// serviceStoreSynced returns true if the jointinferenceservice store has been synced at least once.
+	// serviceStoreSynced returns true if the JointInferenceService store has been synced at least once.
 	serviceStoreSynced cache.InformerSynced
 	// A store of service
 	serviceLister sednav1listers.JointInferenceServiceLister
@@ -80,48 +86,47 @@ type JointInferenceServiceController struct {
 	recorder record.EventRecorder

 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }

-// Start starts the main goroutine responsible for watching and syncing services.
-func (jc *JointInferenceServiceController) Start() error {
+// Run starts the main goroutine responsible for watching and syncing services.
+func (c *Controller) Run(stopCh <-chan struct{}) {
 	workers := 1
-	stopCh := messageContext.Done()

-	go func() {
-		defer utilruntime.HandleCrash()
-		defer jc.queue.ShutDown()
-
-		klog.Infof("Starting joint inference service controller")
-		defer klog.Infof("Shutting down joint inference service controller")
-
-		if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, jc.podStoreSynced, jc.serviceStoreSynced) {
-			klog.Errorf("failed to wait for joint inference service caches to sync")
-			return
-		}
-
-		klog.Infof("Starting joint inference service workers")
-		for i := 0; i < workers; i++ {
-			go wait.Until(jc.worker, time.Second, stopCh)
-		}
-
-		<-stopCh
-	}()
-	return nil
+	defer utilruntime.HandleCrash()
+	defer c.queue.ShutDown()
+
+	klog.Infof("Starting %s controller", Name)
+	defer klog.Infof("Shutting down %s controller", Name)
+
+	if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.serviceStoreSynced) {
+		klog.Errorf("failed to wait for %s caches to sync", Name)
+		return
+	}
+
+	klog.Infof("Starting %s workers", Name)
+	for i := 0; i < workers; i++ {
+		go wait.Until(c.worker, time.Second, stopCh)
+	}
+
+	<-stopCh
 }
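Note the inversion of control here: the old `Start()` spawned its own goroutine and pulled the stop channel from the websocket message layer, while the new `Run(stopCh)` blocks and lets the caller own the lifecycle. Assumed caller-side usage (names invented):

    // The controllers package (or a test) drives the loop and owns shutdown.
    func startJointInference(c *Controller) (stop func()) {
        stopCh := make(chan struct{})
        go c.Run(stopCh) // Run blocks until stopCh is closed
        return func() { close(stopCh) }
    }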
-// enqueueByPod enqueues the jointInferenceService object of the specified pod.
-func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate bool) {
+// enqueueByPod enqueues the JointInferenceService object of the specified pod.
+func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)

 	if controllerRef == nil {
 		return
 	}

-	if controllerRef.Kind != jointServiceControllerKind.Kind {
+	if controllerRef.Kind != Kind.Kind {
 		return
 	}

-	service, err := jc.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name)
+	service, err := c.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name)
 	if err != nil {
 		return
 	}
@@ -130,27 +135,27 @@ func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate b
 		return
 	}

-	jc.enqueueController(service, immediate)
+	c.enqueueController(service, immediate)
 }

 // When a pod is created, enqueue the controller that manages it and update its expectations.
-func (jc *JointInferenceServiceController) addPod(obj interface{}) {
+func (c *Controller) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller, it's possible a new pod shows up in a state that
 		// is already pending deletion. Prevent the pod from being a creation observation.
-		jc.deletePod(pod)
+		c.deletePod(pod)
 		return
 	}

 	// backoff to queue when PodFailed
 	immediate := pod.Status.Phase != v1.PodFailed

-	jc.enqueueByPod(pod, immediate)
+	c.enqueueByPod(pod, immediate)
 }

 // When a pod is updated, figure out which joint inference service manages it and wake it up.
-func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) {
+func (c *Controller) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)
@@ -159,11 +164,11 @@ func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) {
 		return
 	}

-	jc.addPod(curPod)
+	c.addPod(curPod)
 }

-// deletePod enqueues the jointinferenceservice obj When a pod is deleted
-func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
+// deletePod enqueues the JointInferenceService obj when a pod is deleted.
+func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)

 	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -171,7 +176,7 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
 	// When a delete is dropped, the relist will notice a pod in the store not
 	// in the list, leading to the insertion of a tombstone object which contains
 	// the deleted key/value. Note that this value might be stale. If the pod
-	// changed labels the new jointinferenceservice will not be woken up till the periodic resync.
+	// changed labels the new JointInferenceService will not be woken up till the periodic resync.
 	if !ok {
 		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
 		if !ok {
@@ -184,13 +189,13 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
 			return
 		}
 	}
-	jc.enqueueByPod(pod, true)
+	c.enqueueByPod(pod, true)
 }

 // obj could be an *sednav1.JointInferenceService, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.
-func (jc *JointInferenceServiceController) enqueueController(obj interface{}, immediate bool) {
+func (c *Controller) enqueueController(obj interface{}, immediate bool) {
 	key, err := k8scontroller.KeyFunc(obj)
 	if err != nil {
 		klog.Warningf("Couldn't get key for object %+v: %v", obj, err)
@@ -199,42 +204,42 @@ func (jc *JointInferenceServiceController) enqueueController(obj interface{}, im
 	backoff := time.Duration(0)
 	if !immediate {
-		backoff = getBackoff(jc.queue, key)
+		backoff = runtime.GetBackoff(c.queue, key)
 	}
-	jc.queue.AddAfter(key, backoff)
+	c.queue.AddAfter(key, backoff)
 }

 // worker runs a worker thread that just dequeues items, processes them, and marks them done.
 // It enforces that the sync is never invoked concurrently with the same key.
-func (jc *JointInferenceServiceController) worker() {
-	for jc.processNextWorkItem() {
+func (c *Controller) worker() {
+	for c.processNextWorkItem() {
 	}
 }

-func (jc *JointInferenceServiceController) processNextWorkItem() bool {
-	key, quit := jc.queue.Get()
+func (c *Controller) processNextWorkItem() bool {
+	key, quit := c.queue.Get()
 	if quit {
 		return false
 	}
-	defer jc.queue.Done(key)
+	defer c.queue.Done(key)

-	forget, err := jc.sync(key.(string))
+	forget, err := c.sync(key.(string))
 	if err == nil {
 		if forget {
-			jc.queue.Forget(key)
+			c.queue.Forget(key)
 		}
 		return true
 	}

 	klog.Warningf("Error syncing jointinference service: %v", err)
-	jc.queue.AddRateLimited(key)
+	c.queue.AddRateLimited(key)

 	return true
 }

 // sync will sync the jointinferenceservice with the given key.
 // This function is not meant to be invoked concurrently with the same key.
-func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
+func (c *Controller) sync(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing jointinference service %q (%v)", key, time.Since(startTime))
@@ -247,7 +252,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid jointinference service key %q: either namespace or name is missing", key)
 	}
-	sharedJointinferenceservice, err := jc.serviceLister.JointInferenceServices(ns).Get(name)
+	sharedService, err := c.serviceLister.JointInferenceServices(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("JointInferenceService has been deleted: %v", key)
@@ -256,37 +261,38 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 		return false, err
 	}

-	jointinferenceservice := *sharedJointinferenceservice
+	service := *sharedService

-	// if jointinferenceservice was finished previously, we don't want to redo the termination
-	if isJointinferenceserviceFinished(&jointinferenceservice) {
+	// if the service was finished previously, we don't want to redo the termination
+	if isServiceFinished(&service) {
 		return true, nil
 	}

-	// set kind for jointinferenceservice in case that the kind is None
+	// set kind for the service in case the kind is None
 	// more details at https://github.com/kubernetes/kubernetes/issues/3030
-	jointinferenceservice.SetGroupVersionKind(jointServiceControllerKind)
+	service.SetGroupVersionKind(Kind)

-	selector, _ := GenerateSelector(&jointinferenceservice)
-	pods, err := jc.podStore.Pods(jointinferenceservice.Namespace).List(selector)
+	selector, _ := runtime.GenerateSelector(&service)
+	pods, err := c.podStore.Pods(service.Namespace).List(selector)
 	if err != nil {
 		return false, err
 	}

-	klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", jointinferenceservice.Namespace, jointinferenceservice.Name, len(pods), pods)
+	klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", service.Namespace, service.Name, len(pods), pods)

-	latestConditionLen := len(jointinferenceservice.Status.Conditions)
+	latestConditionLen := len(service.Status.Conditions)

-	active := calcActivePodCount(pods)
+	active := runtime.CalcActivePodCount(pods)
 	var failed int32 = 0

 	// neededCounts means that two pods should be created successfully in a jointinference service currently
 	// two pods consist of edge pod and cloud pod
 	var neededCounts int32 = 2

-	// jointinferenceservice first start
-	if jointinferenceservice.Status.StartTime == nil {
+	if service.Status.StartTime == nil {
 		now := metav1.Now()
-		jointinferenceservice.Status.StartTime = &now
+		service.Status.StartTime = &now
 	} else {
 		failed = neededCounts - active
 	}
@@ -298,7 +304,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 	// get the latest condition type
 	// based on that condition updated is appended, not inserted.
-	jobConditions := jointinferenceservice.Status.Conditions
+	jobConditions := service.Status.Conditions
 	if len(jobConditions) > 0 {
 		latestConditionType = (jobConditions)[len(jobConditions)-1].Type
 	}
@@ -311,12 +317,12 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 		serviceFailed = true
 		// TODO: get the failed worker and know which worker failed, the edge or the cloud inference worker
 		reason = "workerFailed"
-		message = "the worker of Jointinferenceservice failed"
+		message = "the worker of service failed"
 		newCondtionType = sednav1.JointInferenceServiceCondFailed
-		jc.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message)
+		c.recorder.Event(&service, v1.EventTypeWarning, reason, message)
 	} else {
 		if len(pods) == 0 {
-			active, manageServiceErr = jc.createWorkers(&jointinferenceservice)
+			active, manageServiceErr = c.createWorkers(&service)
 		}
 		if manageServiceErr != nil {
 			serviceFailed = true
@@ -331,20 +337,20 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 	//
 	if newCondtionType != latestConditionType {
-		jointinferenceservice.Status.Conditions = append(jointinferenceservice.Status.Conditions, NewJointInferenceServiceCondition(newCondtionType, reason, message))
+		service.Status.Conditions = append(service.Status.Conditions, newServiceCondition(newCondtionType, reason, message))
 	}

 	forget := false

 	// no need to update the jointinferenceservice if the status hasn't changed since last time
-	if jointinferenceservice.Status.Active != active || jointinferenceservice.Status.Failed != failed || len(jointinferenceservice.Status.Conditions) != latestConditionLen {
-		jointinferenceservice.Status.Active = active
-		jointinferenceservice.Status.Failed = failed
+	if service.Status.Active != active || service.Status.Failed != failed || len(service.Status.Conditions) != latestConditionLen {
+		service.Status.Active = active
+		service.Status.Failed = failed

-		if err := jc.updateStatus(&jointinferenceservice); err != nil {
+		if err := c.updateStatus(&service); err != nil {
 			return forget, err
 		}

-		if serviceFailed && !isJointinferenceserviceFinished(&jointinferenceservice) {
+		if serviceFailed && !isServiceFinished(&service) {
 			// returning an error will re-enqueue jointinferenceservice after the backoff period
 			return forget, fmt.Errorf("failed pod(s) detected for jointinference service key %q", key)
 		}
@@ -355,8 +361,8 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
 	return forget, manageServiceErr
 }

-// NewJointInferenceServiceCondition creates a new joint condition
-func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
+// newServiceCondition creates a new joint condition
+func newServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
 	return sednav1.JointInferenceServiceCondition{
 		Type:   conditionType,
 		Status: v1.ConditionTrue,
@@ -367,24 +373,20 @@ func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServi
 	}
 }

-func (jc *JointInferenceServiceController) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error {
-	serviceClient := jc.client.JointInferenceServices(jointinferenceservice.Namespace)
-	var err error
-	for i := 0; i <= ResourceUpdateRetries; i = i + 1 {
-		var newJointinferenceservice *sednav1.JointInferenceService
-		newJointinferenceservice, err = serviceClient.Get(context.TODO(), jointinferenceservice.Name, metav1.GetOptions{})
-		if err != nil {
-			break
-		}
-		newJointinferenceservice.Status = jointinferenceservice.Status
-		if _, err = serviceClient.UpdateStatus(context.TODO(), newJointinferenceservice, metav1.UpdateOptions{}); err == nil {
-			break
-		}
-	}
-	return nil
+func (c *Controller) updateStatus(service *sednav1.JointInferenceService) error {
+	client := c.client.JointInferenceServices(service.Namespace)
+	return runtime.RetryUpdateStatus(service.Name, service.Namespace, func() error {
+		newService, err := client.Get(context.TODO(), service.Name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		newService.Status = service.Status
+		_, err = client.UpdateStatus(context.TODO(), newService, metav1.UpdateOptions{})
+		return err
+	})
 }
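The hand-rolled retry loop is now centralized in `runtime.RetryUpdateStatus`, whose implementation is not part of this diff. A plausible shape consistent with its call sites, i.e. bounded retries around a get-mutate-update closure (the retry bound and the lack of backoff here are assumptions):

    // Hypothetical sketch of runtime.RetryUpdateStatus; the real helper may
    // add backoff or conflict-aware retries.
    func RetryUpdateStatus(name, namespace string, update func() error) error {
        const retries = 3 // assumed bound, mirroring the old ResourceUpdateRetries
        var err error
        for i := 0; i < retries; i++ {
            if err = update(); err == nil {
                return nil
            }
        }
        return err
    }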
-func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
+func isServiceFinished(j *sednav1.JointInferenceService) bool {
 	for _, c := range j.Status.Conditions {
 		if (c.Type == sednav1.JointInferenceServiceCondFailed) && c.Status == v1.ConditionTrue {
 			return true
@@ -393,11 +395,11 @@ func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
 	return false
 }

-func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
+func (c *Controller) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
 	active = 0

 	// create cloud worker
-	err = jc.createCloudWorker(service)
+	err = c.createCloudWorker(service)
 	if err != nil {
 		return active, err
 	}
@@ -406,14 +408,14 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
 	// create k8s service for cloudPod
 	// FIXME(llhuii): only the case where Spec.NodeName is specified is supported;
 	// Spec.NodeSelector will be supported later.
-	bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
-	bigServicePort, err := CreateKubernetesService(jc.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
+	bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
+	if err != nil {
+		return active, err
+	}
+	bigServicePort, err := runtime.CreateKubernetesService(c.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
 	if err != nil {
 		return active, err
 	}

 	// create edge worker
-	err = jc.createEdgeWorker(service, bigServicePort)
+	err = c.createEdgeWorker(service, bigServicePort)
 	if err != nil {
 		return active, err
 	}
@@ -422,24 +424,24 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
 	return active, err
 }

-func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.JointInferenceService) error {
+func (c *Controller) createCloudWorker(service *sednav1.JointInferenceService) error {
 	// deliver pod for cloudworker
 	cloudModelName := service.Spec.CloudWorker.Model.Name
-	cloudModel, err := jc.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
+	cloudModel, err := c.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get cloud model %s: %w",
 			cloudModelName, err)
 	}

-	var workerParam WorkerParam
+	var workerParam runtime.WorkerParam

 	secretName := cloudModel.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}

-	workerParam.mounts = append(workerParam.mounts, WorkerMount{
-		URL: &MountURL{
+	workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
+		URL: &runtime.MountURL{
 			URL:                   cloudModel.Spec.URL,
 			Secret:                modelSecret,
 			DownloadByInitializer: true,
@@ -448,7 +450,7 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo
 		EnvName: "MODEL_URL",
 	})

-	workerParam.env = map[string]string{
+	workerParam.Env = map[string]string{
 		"NAMESPACE":    service.Namespace,
 		"SERVICE_NAME": service.Name,
 		"WORKER_NAME":  "cloudworker-" + utilrand.String(5),
@@ -456,21 +458,21 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo
 		"BIG_MODEL_BIND_PORT": strconv.Itoa(int(bigModelPort)),
 	}

-	workerParam.workerType = jointInferenceForCloud
+	workerParam.WorkerType = jointInferenceForCloud

 	// create cloud pod
-	_, err = createPodWithTemplate(jc.kubeClient,
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient,
 		service,
 		&service.Spec.CloudWorker.Template,
 		&workerParam)
 	return err
 }

-func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
+func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
 	// deliver pod for edgeworker
 	ctx := context.Background()
 	edgeModelName := service.Spec.EdgeWorker.Model.Name
-	edgeModel, err := jc.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
+	edgeModel, err := c.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get edge model %s: %w",
 			edgeModelName, err)
@@ -479,13 +481,13 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 	secretName := edgeModel.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}

 	// FIXME(llhuii): only the case where Spec.NodeName is specified is supported;
 	// Spec.NodeSelector will be supported later.
 	// get bigModelIP from nodeName in cloudWorker
-	bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
+	bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
 	if err != nil {
 		return fmt.Errorf("failed to get node ip: %w", err)
 	}
@@ -494,10 +496,10 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 	HEMParameterJSON, _ := json.Marshal(edgeWorker.HardExampleMining.Parameters)
 	HEMParameterString := string(HEMParameterJSON)

-	var workerParam WorkerParam
+	var workerParam runtime.WorkerParam

-	workerParam.mounts = append(workerParam.mounts, WorkerMount{
-		URL: &MountURL{
+	workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
+		URL: &runtime.MountURL{
 			URL:                   edgeModel.Spec.URL,
 			Secret:                modelSecret,
 			DownloadByInitializer: true,
@@ -506,7 +508,7 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 		EnvName: "MODEL_URL",
 	})

-	workerParam.env = map[string]string{
+	workerParam.Env = map[string]string{
 		"NAMESPACE":    service.Namespace,
 		"SERVICE_NAME": service.Name,
 		"WORKER_NAME":  "edgeworker-" + utilrand.String(5),
@@ -517,52 +519,37 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
 		"HEM_NAME":       edgeWorker.HardExampleMining.Name,
 		"HEM_PARAMETERS": HEMParameterString,

-		"LC_SERVER": jc.cfg.LC.Server,
+		"LC_SERVER": c.cfg.LC.Server,
 	}

-	workerParam.workerType = jointInferenceForEdge
-	workerParam.hostNetwork = true
+	workerParam.WorkerType = jointInferenceForEdge
+	workerParam.HostNetwork = true

 	// create edge pod
-	_, err = createPodWithTemplate(jc.kubeClient,
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient,
 		service,
 		&service.Spec.EdgeWorker.Template,
 		&workerParam)
 	return err
 }
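Because `WorkerParam` moved into the shared `runtime` package, its fields (`Mounts`, `Env`, `WorkerType`, `HostNetwork`) had to be exported. A minimal usage sketch mirroring `createEdgeWorker`, with dummy values:

    // Build a WorkerParam under the new exported-field API (values invented).
    func exampleWorkerParam() runtime.WorkerParam {
        var p runtime.WorkerParam
        p.Env = map[string]string{
            "NAMESPACE":   "default",
            "WORKER_NAME": "edgeworker-demo1",
        }
        p.WorkerType = jointInferenceForEdge
        p.HostNetwork = true
        return p
    }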
-// GetName returns the name of the joint inference controller
-func (jc *JointInferenceServiceController) GetName() string {
-	return "JointInferenceServiceController"
-}
-
-// NewJointController creates a new JointInferenceService controller that keeps the relevant pods
+// New creates a new JointInferenceService controller that keeps the relevant pods
 // in sync with their corresponding JointInferenceService objects.
-func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
-	var err error
-	namespace := cfg.Namespace
-	if namespace == "" {
-		namespace = metav1.NamespaceAll
-	}
-	kubeClient, _ := utils.KubeClient()
-	kubecfg, _ := utils.KubeConfig()
-	crdclient, _ := clientset.NewForConfig(kubecfg)
-	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	cfg := cc.Config

-	podInformer := kubeInformerFactory.Core().V1().Pods()
+	podInformer := cc.KubeInformerFactory.Core().V1().Pods()

-	serviceInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
-	serviceInformer := serviceInformerFactory.Sedna().V1alpha1().JointInferenceServices()
+	serviceInformer := cc.SednaInformerFactory.Sedna().V1alpha1().JointInferenceServices()

 	eventBroadcaster := record.NewBroadcaster()
-	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
+	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})

-	jc := &JointInferenceServiceController{
-		kubeClient: kubeClient,
-		client:     crdclient.SednaV1alpha1(),
+	jc := &Controller{
+		kubeClient: cc.KubeClient,
+		client:     cc.SednaClient.SednaV1alpha1(),
-		queue:      workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "jointinferenceservice"),
+		queue:      workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "jointinferenceservice"),
 		recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "jointinferenceservice-controller"}),
 		cfg:        cfg,
 	}
@@ -570,14 +557,17 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error
 	serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Added, obj)
 		},
 		UpdateFunc: func(old, cur interface{}) {
 			jc.enqueueController(cur, true)
+			jc.syncToEdge(watch.Added, cur)
 		},
 		DeleteFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Deleted, obj)
 		},
 	})
@@ -593,8 +583,5 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error
 	jc.podStore = podInformer.Lister()
 	jc.podStoreSynced = podInformer.Informer().HasSynced

-	stopCh := messageContext.Done()
-	kubeInformerFactory.Start(stopCh)
-	serviceInformerFactory.Start(stopCh)
-	return jc, err
+	return jc, nil
 }
@@ -0,0 +1,92 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package jointinference
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
+)
+
+func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error {
+	client := c.client.JointInferenceServices(namespace)
+
+	return runtime.RetryUpdateStatus(name, namespace, func() error {
+		joint, err := client.Get(context.TODO(), name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		joint.Status.Metrics = metrics
+		_, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{})
+		return err
+	})
+}
+
+// updateFromEdge syncs the edge updates to k8s.
+func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
+	// Output defines the owner output information.
+	type Output struct {
+		ServiceInfo map[string]interface{} `json:"ownerInfo"`
+	}
+
+	var status struct {
+		// Phase should always be "inference".
+		Phase  string  `json:"phase"`
+		Status string  `json:"status"`
+		Output *Output `json:"output"`
+	}
+
+	err := json.Unmarshal(content, &status)
+	if err != nil {
+		return err
+	}
+
+	// TODO: propagate status.Status to k8s
+
+	output := status.Output
+	if output == nil || output.ServiceInfo == nil {
+		// no output info
+		klog.Warningf("empty status info for joint inference service %s/%s", namespace, name)
+		return nil
+	}
+
+	info := output.ServiceInfo
+
+	for _, ignoreTimeKey := range []string{
+		"startTime",
+		"updateTime",
+	} {
+		delete(info, ignoreTimeKey)
+	}
+
+	metrics := runtime.ConvertMapToMetrics(info)
+
+	err = c.updateMetrics(name, namespace, metrics)
+	if err != nil {
+		return fmt.Errorf("failed to update metrics, err: %w", err)
+	}
+	return nil
+}
+
+func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
+	return addFunc(KindName, c.updateFromEdge)
+}
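For a sense of the data flow, the map reaching `ConvertMapToMetrics` after the time keys are stripped might look like the invented report below; the exact fields of `sednav1.Metric` are defined in the Sedna API types, not in this diff:

    // Illustrative only: convert a reported ownerInfo map into metrics.
    func exampleMetrics() []sednav1.Metric {
        info := map[string]interface{}{
            "inferenceNumber":   573,
            "hardExampleNumber": 32,
        }
        return runtime.ConvertMapToMetrics(info)
    }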
@@ -0,0 +1,55 @@
+/*
+Copyright 2021 The KubeEdge Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package lifelonglearning
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/watch"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
+)
+
+func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
+	job, ok := obj.(*sednav1.LifelongLearningJob)
+	if !ok {
+		return nil
+	}
+
+	// Since Kind may be empty, fix it here if missing.
+	// More details at https://github.com/kubernetes/kubernetes/issues/3030
+	job.Kind = KindName
+
+	// Only propagate to the nodes with a non-empty name.
+	// FIXME(llhuii): only the case where all workers share the same nodeName is supported;
+	// Spec.NodeSelector and different nodeNames will be supported later.
+	nodeName := job.Spec.TrainSpec.Template.Spec.NodeName
+	if len(nodeName) == 0 {
+		return fmt.Errorf("empty node name")
+	}
+
+	runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)
+
+	return c.sendToEdgeFunc(nodeName, eventType, job)
+}
+
+func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
+	c.sendToEdgeFunc = f
+	return nil
+}
| @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and | |||||
| limitations under the License. | limitations under the License. | ||||
| */ | */ | ||||
| package globalmanager | |||||
| package lifelonglearning | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| @@ -28,9 +28,8 @@ import ( | |||||
| utilrand "k8s.io/apimachinery/pkg/util/rand" | utilrand "k8s.io/apimachinery/pkg/util/rand" | ||||
| utilruntime "k8s.io/apimachinery/pkg/util/runtime" | utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||||
| "k8s.io/apimachinery/pkg/util/wait" | "k8s.io/apimachinery/pkg/util/wait" | ||||
| kubeinformers "k8s.io/client-go/informers" | |||||
| "k8s.io/apimachinery/pkg/watch" | |||||
| "k8s.io/client-go/kubernetes" | "k8s.io/client-go/kubernetes" | ||||
| "k8s.io/client-go/kubernetes/scheme" | |||||
| v1core "k8s.io/client-go/kubernetes/typed/core/v1" | v1core "k8s.io/client-go/kubernetes/typed/core/v1" | ||||
| corelisters "k8s.io/client-go/listers/core/v1" | corelisters "k8s.io/client-go/listers/core/v1" | ||||
| "k8s.io/client-go/tools/cache" | "k8s.io/client-go/tools/cache" | ||||
| @@ -40,21 +39,25 @@ import ( | |||||
| k8scontroller "k8s.io/kubernetes/pkg/controller" | k8scontroller "k8s.io/kubernetes/pkg/controller" | ||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | ||||
| clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" | |||||
| sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | ||||
| informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" | |||||
| sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" | sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" | ||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | "github.com/kubeedge/sedna/pkg/globalmanager/config" | ||||
| messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/utils" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | ) | ||||
| // ljControllerKind contains the schema.GroupVersionKind for this controller type. | |||||
| var ljControllerKind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob") | |||||
| const ( | |||||
| // KindName is the kind name of CR this controller controls | |||||
| KindName = "LifelongLearningJob" | |||||
| // Name is the name of this controller | |||||
| Name = "LifelongLearning" | |||||
| ) | |||||
| // Kind contains the schema.GroupVersionKind for this controller type. | |||||
| var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) | |||||
| // LifelongLearningJobController ensures that all LifelongLearningJob objects have corresponding pods to | |||||
| // Controller ensures that all LifelongLearningJob objects have corresponding pods to | |||||
| // run their configured workload. | // run their configured workload. | ||||
| type LifelongLearningJobController struct { | |||||
| type Controller struct { | |||||
| kubeClient kubernetes.Interface | kubeClient kubernetes.Interface | ||||
| client sednaclientset.SednaV1alpha1Interface | client sednaclientset.SednaV1alpha1Interface | ||||
| @@ -74,50 +77,47 @@ type LifelongLearningJobController struct { | |||||
| // LifelongLearningJobs that need to be updated | // LifelongLearningJobs that need to be updated | ||||
| queue workqueue.RateLimitingInterface | queue workqueue.RateLimitingInterface | ||||
| recorder record.EventRecorder | |||||
| cfg *config.ControllerConfig | cfg *config.ControllerConfig | ||||
| sendToEdgeFunc runtime.DownstreamSendFunc | |||||
| } | } | ||||
| // Run the main goroutine responsible for watching and syncing jobs. | |||||
| func (jc *LifelongLearningJobController) Start() error { | |||||
| // Run starts the main goroutine responsible for watching and syncing jobs. | |||||
| func (c *Controller) Run(stopCh <-chan struct{}) { | |||||
| workers := 1 | workers := 1 | ||||
| stopCh := messageContext.Done() | |||||
| go func() { | |||||
| defer utilruntime.HandleCrash() | |||||
| defer jc.queue.ShutDown() | |||||
| klog.Infof("Starting lifelonglearning job controller") | |||||
| defer klog.Infof("Shutting down lifelonglearning job controller") | |||||
| defer utilruntime.HandleCrash() | |||||
| defer c.queue.ShutDown() | |||||
| if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) { | |||||
| klog.Errorf("failed to wait for caches to sync") | |||||
| klog.Infof("Starting %s controller", Name) | |||||
| defer klog.Infof("Shutting down %s controller", Name) | |||||
| return | |||||
| } | |||||
| klog.Infof("Starting lifelonglearning job workers") | |||||
| for i := 0; i < workers; i++ { | |||||
| go wait.Until(jc.worker, time.Second, stopCh) | |||||
| } | |||||
| if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { | |||||
| klog.Errorf("failed to wait for %s caches to sync", Name) | |||||
| <-stopCh | |||||
| }() | |||||
| return nil | |||||
| return | |||||
| } | |||||
| klog.Infof("Starting %s workers", Name) | |||||
| for i := 0; i < workers; i++ { | |||||
| go wait.Until(c.worker, time.Second, stopCh) | |||||
| } | |||||
| <-stopCh | |||||
| } | } | ||||
| // enqueueByPod enqueues the lifelonglearningjob object of the specified pod. | // enqueueByPod enqueues the lifelonglearningjob object of the specified pod. | ||||
| func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate bool) { | |||||
| func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { | |||||
| controllerRef := metav1.GetControllerOf(pod) | controllerRef := metav1.GetControllerOf(pod) | ||||
| if controllerRef == nil { | if controllerRef == nil { | ||||
| return | return | ||||
| } | } | ||||
| if controllerRef.Kind != ljControllerKind.Kind { | |||||
| if controllerRef.Kind != Kind.Kind { | |||||
| return | return | ||||
| } | } | ||||
| service, err := jc.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name) | |||||
| service, err := c.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name) | |||||
| if err != nil { | if err != nil { | ||||
| return | return | ||||
| } | } | ||||
| @@ -126,27 +126,27 @@ func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate boo | |||||
| return | return | ||||
| } | } | ||||
| jc.enqueueController(service, immediate) | |||||
| c.enqueueController(service, immediate) | |||||
| } | } | ||||
| // When a pod is created, enqueue the controller that manages it and update its expectations. | // When a pod is created, enqueue the controller that manages it and update its expectations. | ||||
| func (jc *LifelongLearningJobController) addPod(obj interface{}) { | |||||
| func (c *Controller) addPod(obj interface{}) { | |||||
| pod := obj.(*v1.Pod) | pod := obj.(*v1.Pod) | ||||
| if pod.DeletionTimestamp != nil { | if pod.DeletionTimestamp != nil { | ||||
| // on a restart of the controller, it's possible a new pod shows up in a state that | // on a restart of the controller, it's possible a new pod shows up in a state that | ||||
| // is already pending deletion. Prevent the pod from being a creation observation. | // is already pending deletion. Prevent the pod from being a creation observation. | ||||
| jc.deletePod(pod) | |||||
| c.deletePod(pod) | |||||
| return | return | ||||
| } | } | ||||
| // backoff to queue when PodFailed | // backoff to queue when PodFailed | ||||
| immediate := pod.Status.Phase != v1.PodFailed | immediate := pod.Status.Phase != v1.PodFailed | ||||
| jc.enqueueByPod(pod, immediate) | |||||
| c.enqueueByPod(pod, immediate) | |||||
| } | } | ||||
| // When a pod is updated, figure out which lifelonglearning job manages it and wake it up. | // When a pod is updated, figure out which lifelonglearning job manages it and wake it up. | ||||
| func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) { | |||||
| func (c *Controller) updatePod(old, cur interface{}) { | |||||
| curPod := cur.(*v1.Pod) | curPod := cur.(*v1.Pod) | ||||
| oldPod := old.(*v1.Pod) | oldPod := old.(*v1.Pod) | ||||
| @@ -155,11 +155,11 @@ func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) { | |||||
| return | return | ||||
| } | } | ||||
| jc.addPod(curPod) | |||||
| c.addPod(curPod) | |||||
| } | } | ||||
| // deletePod enqueues the lifelonglearningjob obj when a pod is deleted | // deletePod enqueues the lifelonglearningjob obj when a pod is deleted | ||||
| func (jc *LifelongLearningJobController) deletePod(obj interface{}) { | |||||
| func (c *Controller) deletePod(obj interface{}) { | |||||
| pod, ok := obj.(*v1.Pod) | pod, ok := obj.(*v1.Pod) | ||||
| // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go | // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go | ||||
| @@ -180,13 +180,13 @@ func (jc *LifelongLearningJobController) deletePod(obj interface{}) { | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| jc.enqueueByPod(pod, true) | |||||
| c.enqueueByPod(pod, true) | |||||
| } | } | ||||
| // obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item, | // obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item, | ||||
| // immediate tells the controller to update the status right away, and should | // immediate tells the controller to update the status right away, and should | ||||
| // happen ONLY when there was a successful pod run. | // happen ONLY when there was a successful pod run. | ||||
| func (jc *LifelongLearningJobController) enqueueController(obj interface{}, immediate bool) { | |||||
| func (c *Controller) enqueueController(obj interface{}, immediate bool) { | |||||
| key, err := k8scontroller.KeyFunc(obj) | key, err := k8scontroller.KeyFunc(obj) | ||||
| if err != nil { | if err != nil { | ||||
| utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) | utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) | ||||
| @@ -195,36 +195,36 @@ func (jc *LifelongLearningJobController) enqueueController(obj interface{}, imme | |||||
| backoff := time.Duration(0) | backoff := time.Duration(0) | ||||
| if !immediate { | if !immediate { | ||||
| backoff = getBackoff(jc.queue, key) | |||||
| backoff = runtime.GetBackoff(c.queue, key) | |||||
| } | } | ||||
| jc.queue.AddAfter(key, backoff) | |||||
| c.queue.AddAfter(key, backoff) | |||||
| } | } | ||||
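runtime.GetBackoff replaces the old package-local getBackoff. Its body is outside this hunk; a minimal sketch of what it presumably does, mirroring the upstream Kubernetes job controller (the constant values here are assumptions):

package runtime

import (
	"math"
	"time"

	"k8s.io/client-go/util/workqueue"
)

// Assumed values; the real constants are defined elsewhere in this package.
const (
	DefaultBackOff = 10 * time.Second
	MaxBackOff     = 360 * time.Second
)

// GetBackoff derives an exponential delay from how often the key has been
// requeued, capped at MaxBackOff.
func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
	exp := queue.NumRequeues(key)
	if exp <= 0 {
		return 0
	}
	backoff := float64(DefaultBackOff.Nanoseconds()) * math.Pow(2, float64(exp-1))
	if backoff > math.MaxInt64 {
		return MaxBackOff
	}
	if d := time.Duration(backoff); d < MaxBackOff {
		return d
	}
	return MaxBackOff
}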
| // worker runs a worker thread that just dequeues items, processes them, and marks them done. | // worker runs a worker thread that just dequeues items, processes them, and marks them done. | ||||
| // It enforces that the syncHandler is never invoked concurrently with the same key. | // It enforces that the syncHandler is never invoked concurrently with the same key. | ||||
| func (jc *LifelongLearningJobController) worker() { | |||||
| for jc.processNextWorkItem() { | |||||
| func (c *Controller) worker() { | |||||
| for c.processNextWorkItem() { | |||||
| } | } | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) processNextWorkItem() bool { | |||||
| key, quit := jc.queue.Get() | |||||
| func (c *Controller) processNextWorkItem() bool { | |||||
| key, quit := c.queue.Get() | |||||
| if quit { | if quit { | ||||
| return false | return false | ||||
| } | } | ||||
| defer jc.queue.Done(key) | |||||
| defer c.queue.Done(key) | |||||
| forget, err := jc.sync(key.(string)) | |||||
| forget, err := c.sync(key.(string)) | |||||
| if err == nil { | if err == nil { | ||||
| if forget { | if forget { | ||||
| jc.queue.Forget(key) | |||||
| c.queue.Forget(key) | |||||
| } | } | ||||
| return true | return true | ||||
| } | } | ||||
| utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err)) | utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err)) | ||||
| jc.queue.AddRateLimited(key) | |||||
| c.queue.AddRateLimited(key) | |||||
| return true | return true | ||||
| } | } | ||||
| @@ -232,7 +232,7 @@ func (jc *LifelongLearningJobController) processNextWorkItem() bool { | |||||
| // sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning | // sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning | ||||
| // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked | // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked | ||||
| // concurrently with the same key. | // concurrently with the same key. | ||||
| func (jc *LifelongLearningJobController) sync(key string) (bool, error) { | |||||
| func (c *Controller) sync(key string) (bool, error) { | |||||
| startTime := time.Now() | startTime := time.Now() | ||||
| defer func() { | defer func() { | ||||
| klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime)) | klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime)) | ||||
| @@ -245,7 +245,7 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { | |||||
| if len(ns) == 0 || len(name) == 0 { | if len(ns) == 0 || len(name) == 0 { | ||||
| return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key) | return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key) | ||||
| } | } | ||||
| sharedLifelongLearningJob, err := jc.jobLister.LifelongLearningJobs(ns).Get(name) | |||||
| sharedJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name) | |||||
| if err != nil { | if err != nil { | ||||
| if errors.IsNotFound(err) { | if errors.IsNotFound(err) { | ||||
| klog.V(4).Infof("lifelonglearning job has been deleted: %v", key) | klog.V(4).Infof("lifelonglearning job has been deleted: %v", key) | ||||
| @@ -253,18 +253,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { | |||||
| } | } | ||||
| return false, err | return false, err | ||||
| } | } | ||||
| lifelonglearningjob := *sharedLifelongLearningJob | |||||
| job := *sharedJob | |||||
| // set kind for the lifelonglearningjob in case it is empty | // set kind for the lifelonglearningjob in case it is empty | ||||
| lifelonglearningjob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob")) | |||||
| job.SetGroupVersionKind(Kind) | |||||
| // lifelonglearningjob first start | |||||
| if lifelonglearningjob.Status.StartTime == nil { | |||||
| if job.Status.StartTime == nil { | |||||
| // the job starts for the first time | |||||
| now := metav1.Now() | now := metav1.Now() | ||||
| lifelonglearningjob.Status.StartTime = &now | |||||
| job.Status.StartTime = &now | |||||
| } | } | ||||
| // if lifelonglearningjob was finished previously, we don't want to redo the termination | |||||
| if IsLifelongLearningJobFinished(&lifelonglearningjob) { | |||||
| // if job was finished previously, we don't want to redo the termination | |||||
| if IsJobFinished(&job) { | |||||
| return true, nil | return true, nil | ||||
| } | } | ||||
| @@ -272,18 +272,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { | |||||
| jobFailed := false | jobFailed := false | ||||
| needUpdated := false | needUpdated := false | ||||
| // update conditions of lifelonglearning job | |||||
| needUpdated, err = jc.updateLifelongLearningJobConditions(&lifelonglearningjob) | |||||
| // transit this job's state machine | |||||
| needUpdated, err = c.transitJobState(&job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v faied to be updated, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err: %s", job.Namespace, job.Name, err) | |||||
| } | } | ||||
| if needUpdated { | if needUpdated { | ||||
| if err := jc.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil { | |||||
| if err := c.updateJobStatus(&job); err != nil { | |||||
| return forget, err | return forget, err | ||||
| } | } | ||||
| if jobFailed && !IsLifelongLearningJobFinished(&lifelonglearningjob) { | |||||
| if jobFailed && !IsJobFinished(&job) { | |||||
| // returning an error will re-enqueue LifelongLearningJob after the backoff period | // returning an error will re-enqueue LifelongLearningJob after the backoff period | ||||
| return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key) | return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key) | ||||
| } | } | ||||
| @@ -294,24 +294,25 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { | |||||
| return forget, err | return forget, err | ||||
| } | } | ||||
| // updateLifelongLearningJobConditions ensures that conditions of lifelonglearning job can be changed by podstatus | |||||
| func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) { | |||||
| // transitJobState transits the job to its next state | |||||
| func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) { | |||||
| var initialType sednav1.LLJobStageConditionType | var initialType sednav1.LLJobStageConditionType | ||||
| var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{ | var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{ | ||||
| Stage: sednav1.LLJobTrain, | Stage: sednav1.LLJobTrain, | ||||
| Type: initialType, | Type: initialType, | ||||
| } | } | ||||
| var newConditionType sednav1.LLJobStageConditionType | var newConditionType sednav1.LLJobStageConditionType | ||||
| latestCondition.Stage = sednav1.LLJobTrain | |||||
| var needUpdated = false | var needUpdated = false | ||||
| jobConditions := lifelonglearningjob.Status.Conditions | |||||
| var podStatus v1.PodPhase = v1.PodUnknown | var podStatus v1.PodPhase = v1.PodUnknown | ||||
| jobConditions := job.Status.Conditions | |||||
| if len(jobConditions) > 0 { | if len(jobConditions) > 0 { | ||||
| // get latest pod and pod status | // get latest pod and pod status | ||||
| latestCondition = (jobConditions)[len(jobConditions)-1] | latestCondition = (jobConditions)[len(jobConditions)-1] | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", lifelonglearningjob.Namespace, lifelonglearningjob.Name, | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name, | |||||
| latestCondition.Stage) | latestCondition.Stage) | ||||
| pod := jc.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage)) | |||||
| pod := c.getSpecifiedPods(job, string(latestCondition.Stage)) | |||||
| if pod != nil { | if pod != nil { | ||||
| podStatus = pod.Status.Phase | podStatus = pod.Status.Phase | ||||
| @@ -333,14 +334,14 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif | |||||
| // include train, eval, deploy pod | // include train, eval, deploy pod | ||||
| var err error | var err error | ||||
| if jobStage == sednav1.LLJobDeploy { | if jobStage == sednav1.LLJobDeploy { | ||||
| err = jc.restartInferPod(lifelonglearningjob) | |||||
| err = c.restartInferPod(job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) | |||||
| } else { | } else { | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", lifelonglearningjob.Namespace, lifelonglearningjob.Name) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) | |||||
| } | } | ||||
| } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { | } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { | ||||
| err = jc.createPod(lifelonglearningjob, jobStage) | |||||
| err = c.createPod(job, jobStage) | |||||
| } | } | ||||
| if err != nil { | if err != nil { | ||||
| return needUpdated, err | return needUpdated, err | ||||
| @@ -358,13 +359,13 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif | |||||
| } else if podStatus == v1.PodSucceeded { | } else if podStatus == v1.PodSucceeded { | ||||
| // watch pod status, if pod completed, set type completed | // watch pod status, if pod completed, set type completed | ||||
| newConditionType = sednav1.LLJobStageCondCompleted | newConditionType = sednav1.LLJobStageCondCompleted | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage) | |||||
| } else if podStatus == v1.PodFailed { | } else if podStatus == v1.PodFailed { | ||||
| newConditionType = sednav1.LLJobStageCondFailed | newConditionType = sednav1.LLJobStageCondFailed | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage) | |||||
| } | } | ||||
| case sednav1.LLJobStageCondCompleted: | case sednav1.LLJobStageCondCompleted: | ||||
| jobStage = jc.getNextStage(jobStage) | |||||
| jobStage = c.getNextStage(jobStage) | |||||
| newConditionType = sednav1.LLJobStageCondWaiting | newConditionType = sednav1.LLJobStageCondWaiting | ||||
| case sednav1.LLJobStageCondFailed: | case sednav1.LLJobStageCondFailed: | ||||
| @@ -374,34 +375,31 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif | |||||
| default: | default: | ||||
| // do nothing when given other type out of cases | // do nothing when given other type out of cases | ||||
| } | } | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobConditions) | |||||
| klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions) | |||||
| if latestCondition.Type != newConditionType { | if latestCondition.Type != newConditionType { | ||||
| lifelonglearningjob.Status.Conditions = append(lifelonglearningjob.Status.Conditions, NewLifelongLearningJobCondition(newConditionType, jobStage)) | |||||
| job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(newConditionType, jobStage)) | |||||
| needUpdated = true | needUpdated = true | ||||
| return needUpdated, nil | return needUpdated, nil | ||||
| } | } | ||||
| return needUpdated, nil | return needUpdated, nil | ||||
| } | } | ||||
| // updateLifelongLearningJobStatus ensures that jobstatus can be updated rightly | |||||
| func (jc *LifelongLearningJobController) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error { | |||||
| jobClient := jc.client.LifelongLearningJobs(lifelonglearningjob.Namespace) | |||||
| var err error | |||||
| for i := 0; i <= ResourceUpdateRetries; i = i + 1 { | |||||
| var newLifelongLearningJob *sednav1.LifelongLearningJob | |||||
| newLifelongLearningJob, err = jobClient.Get(context.TODO(), lifelonglearningjob.Name, metav1.GetOptions{}) | |||||
| // updateJobStatus ensures that the job status is updated correctly | |||||
| func (c *Controller) updateJobStatus(job *sednav1.LifelongLearningJob) error { | |||||
| jobClient := c.client.LifelongLearningJobs(job.Namespace) | |||||
| return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { | |||||
| newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| break | |||||
| return err | |||||
| } | } | ||||
| newLifelongLearningJob.Status = lifelonglearningjob.Status | |||||
| if _, err = jobClient.UpdateStatus(context.TODO(), newLifelongLearningJob, metav1.UpdateOptions{}); err == nil { | |||||
| break | |||||
| } | |||||
| } | |||||
| return err | |||||
| newJob.Status = job.Status | |||||
| _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | } | ||||
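runtime.RetryUpdateStatus absorbs the get/update retry loop that the removed code (left column) spelled out inline. A rough sketch under that assumption; the real helper might equally be built on client-go's retry.RetryOnConflict:

package runtime

import "k8s.io/klog/v2"

// ResourceUpdateRetries bounds the loop; the removed code used a constant of
// the same name. The value here is an assumption.
const ResourceUpdateRetries = 3

// RetryUpdateStatus retries updateStatusFunc until it succeeds or the retry
// budget is exhausted, returning the last error.
func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
	var err error
	for i := 0; i <= ResourceUpdateRetries; i++ {
		if err = updateStatusFunc(); err == nil {
			return nil
		}
		klog.Warningf("failed to update status of %s/%s (attempt %d): %v", namespace, name, i+1, err)
	}
	return err
}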
| func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition { | |||||
| func NewJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition { | |||||
| return sednav1.LLJobCondition{ | return sednav1.LLJobCondition{ | ||||
| Type: conditionType, | Type: conditionType, | ||||
| Status: v1.ConditionTrue, | Status: v1.ConditionTrue, | ||||
| @@ -413,17 +411,17 @@ func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionTy | |||||
| } | } | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) generatePodName(jobName string, workerType string) string { | |||||
| func (c *Controller) generatePodName(jobName string, workerType string) string { | |||||
| return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) | return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { | |||||
| func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { | |||||
| if podType == "Deploy" { | if podType == "Deploy" { | ||||
| podType = InferencePodType | |||||
| podType = runtime.InferencePodType | |||||
| } | } | ||||
| var latestPod *v1.Pod | var latestPod *v1.Pod | ||||
| selector, _ := GenerateSelector(job) | |||||
| pods, err := jc.podStore.Pods(job.Namespace).List(selector) | |||||
| selector, _ := runtime.GenerateSelector(job) | |||||
| pods, err := c.podStore.Pods(job.Namespace).List(selector) | |||||
| if len(pods) == 0 || err != nil { | if len(pods) == 0 || err != nil { | ||||
| return nil | return nil | ||||
| } | } | ||||
| @@ -443,20 +441,20 @@ func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongL | |||||
| return latestPod | return latestPod | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLearningJob) error { | |||||
| inferPod := jc.getSpecifiedPods(job, InferencePodType) | |||||
| func (c *Controller) restartInferPod(job *sednav1.LifelongLearningJob) error { | |||||
| inferPod := c.getSpecifiedPods(job, runtime.InferencePodType) | |||||
| if inferPod == nil { | if inferPod == nil { | ||||
| klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name) | klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name) | ||||
| err := jc.createInferPod(job) | |||||
| err := c.createInferPod(job) | |||||
| return err | return err | ||||
| } | } | ||||
| ctx := context.Background() | ctx := context.Background() | ||||
| err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) | |||||
| err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | ||||
| return err | return err | ||||
| } | } | ||||
| err = jc.createInferPod(job) | |||||
| err = c.createInferPod(job) | |||||
| if err != nil { | if err != nil { | ||||
| klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) | ||||
| return err | return err | ||||
| @@ -464,7 +462,7 @@ func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLe | |||||
| return nil | return nil | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage { | |||||
| func (c *Controller) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage { | |||||
| switch currentStage { | switch currentStage { | ||||
| case sednav1.LLJobTrain: | case sednav1.LLJobTrain: | ||||
| return sednav1.LLJobEval | return sednav1.LLJobEval | ||||
| @@ -477,9 +475,9 @@ func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJob | |||||
| } | } | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { | |||||
| func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { | |||||
| if name != "" { | if name != "" { | ||||
| secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| err = fmt.Errorf("failed to get the secret %s for %s: %w", | err = fmt.Errorf("failed to get the secret %s for %s: %w", | ||||
| name, | name, | ||||
| @@ -489,23 +487,23 @@ func (jc *LifelongLearningJobController) getSecret(namespace, name string, owner | |||||
| return | return | ||||
| } | } | ||||
| func IsLifelongLearningJobFinished(j *sednav1.LifelongLearningJob) bool { | |||||
| func IsJobFinished(j *sednav1.LifelongLearningJob) bool { | |||||
| // TODO | // TODO | ||||
| return false | return false | ||||
| } | } | ||||
| func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { | |||||
| func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { | |||||
| ctx := context.Background() | ctx := context.Background() | ||||
| var podTemplate *v1.PodTemplateSpec | var podTemplate *v1.PodTemplateSpec | ||||
| LLDatasetName := job.Spec.Dataset.Name | LLDatasetName := job.Spec.Dataset.Name | ||||
| dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) | |||||
| dataset, err := c.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) | |||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err) | return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err) | ||||
| } | } | ||||
| datasetSecret, err := jc.getSecret( | |||||
| datasetSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| dataset.Spec.CredentialName, | dataset.Spec.CredentialName, | ||||
| fmt.Sprintf("dataset %s", dataset.Name), | fmt.Sprintf("dataset %s", dataset.Name), | ||||
| @@ -514,7 +512,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| return err | return err | ||||
| } | } | ||||
| jobSecret, err := jc.getSecret( | |||||
| jobSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| job.Spec.CredentialName, | job.Spec.CredentialName, | ||||
| fmt.Sprintf("lifelonglearning job %s", job.Name), | fmt.Sprintf("lifelonglearning job %s", job.Name), | ||||
| @@ -526,7 +524,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| // get all URLs for train and eval from the condition data | // get all URLs for train and eval from the condition data | ||||
| condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data | condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data | ||||
| klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) | klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) | ||||
| var cond LifelongLearningCondData | |||||
| var cond ConditionData | |||||
| (&cond).Unmarshal([]byte(condDataStr)) | (&cond).Unmarshal([]byte(condDataStr)) | ||||
| if cond.Input == nil { | if cond.Input == nil { | ||||
| return fmt.Errorf("empty input from condData") | return fmt.Errorf("empty input from condData") | ||||
| @@ -543,25 +541,25 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| originalDataURLOrIndex = dataset.Spec.URL | originalDataURLOrIndex = dataset.Spec.URL | ||||
| } | } | ||||
| var workerParam *WorkerParam = new(WorkerParam) | |||||
| var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) | |||||
| if podtype == sednav1.LLJobTrain { | if podtype == sednav1.LLJobTrain { | ||||
| workerParam.workerType = "Train" | |||||
| workerParam.WorkerType = "Train" | |||||
| podTemplate = &job.Spec.TrainSpec.Template | podTemplate = &job.Spec.TrainSpec.Template | ||||
| // Env parameters for train | // Env parameters for train | ||||
| workerParam.env = map[string]string{ | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "train-worker-" + utilrand.String(5), | "WORKER_NAME": "train-worker-" + utilrand.String(5), | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "KB_SERVER": jc.cfg.KB.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| "KB_SERVER": c.cfg.KB.Server, | |||||
| } | } | ||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: cond.Input.OutputDir, | URL: cond.Input.OutputDir, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: false, | DownloadByInitializer: false, | ||||
| @@ -569,8 +567,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| EnvName: "OUTPUT_URL", | EnvName: "OUTPUT_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: dataURL, | URL: dataURL, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -579,8 +577,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| }, | }, | ||||
| // see https://github.com/kubeedge/sedna/issues/35 | // see https://github.com/kubeedge/sedna/issues/35 | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| URL: originalDataURLOrIndex, | URL: originalDataURLOrIndex, | ||||
| Indirect: dataset.Spec.URL != originalDataURLOrIndex, | Indirect: dataset.Spec.URL != originalDataURLOrIndex, | ||||
| @@ -591,35 +589,35 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| ) | ) | ||||
| } else { | } else { | ||||
| podTemplate = &job.Spec.EvalSpec.Template | podTemplate = &job.Spec.EvalSpec.Template | ||||
| workerParam.workerType = "Eval" | |||||
| workerParam.WorkerType = "Eval" | |||||
| // Configure Env information for eval by initial WorkerParam | // Configure Env information for eval by initial WorkerParam | ||||
| workerParam.env = map[string]string{ | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "eval-worker-" + utilrand.String(5), | "WORKER_NAME": "eval-worker-" + utilrand.String(5), | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "KB_SERVER": jc.cfg.KB.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| "KB_SERVER": c.cfg.KB.Server, | |||||
| } | } | ||||
| var modelMountURLs []MountURL | |||||
| var modelMountURLs []runtime.MountURL | |||||
| for _, url := range inputmodelURLs { | for _, url := range inputmodelURLs { | ||||
| modelMountURLs = append(modelMountURLs, MountURL{ | |||||
| modelMountURLs = append(modelMountURLs, runtime.MountURL{ | |||||
| URL: url, | URL: url, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| }) | }) | ||||
| } | } | ||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URLs: modelMountURLs, | URLs: modelMountURLs, | ||||
| Name: "models", | Name: "models", | ||||
| EnvName: "MODEL_URLS", | EnvName: "MODEL_URLS", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: cond.Input.OutputDir, | URL: cond.Input.OutputDir, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: false, | DownloadByInitializer: false, | ||||
| @@ -627,8 +625,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| EnvName: "OUTPUT_URL", | EnvName: "OUTPUT_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: dataURL, | URL: dataURL, | ||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -637,8 +635,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| EnvName: "TEST_DATASET_URL", | EnvName: "TEST_DATASET_URL", | ||||
| }, | }, | ||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| Secret: datasetSecret, | Secret: datasetSecret, | ||||
| URL: originalDataURLOrIndex, | URL: originalDataURLOrIndex, | ||||
| DownloadByInitializer: true, | DownloadByInitializer: true, | ||||
| @@ -651,21 +649,21 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning | |||||
| } | } | ||||
| // set the default policy instead of Always policy | // set the default policy instead of Always policy | ||||
| workerParam.restartPolicy = v1.RestartPolicyOnFailure | |||||
| workerParam.hostNetwork = true | |||||
| workerParam.RestartPolicy = v1.RestartPolicyOnFailure | |||||
| workerParam.HostNetwork = true | |||||
| // create pod based on podtype | // create pod based on podtype | ||||
| _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) | |||||
| _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam) | |||||
| if err != nil { | if err != nil { | ||||
| return err | return err | ||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
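The WorkerParam family of types moved into the runtime package with exported fields, which is what this mechanical rename reflects. Their shape, inferred purely from the usages in this diff (a sketch, not the authoritative definitions):

// Inferred from usage; see pkg/globalmanager/runtime for the real types.
type MountURL struct {
	URL                   string
	Secret                *v1.Secret
	DownloadByInitializer bool
	Indirect              bool
}

type WorkerMount struct {
	URL     *MountURL  // a single mount
	URLs    []MountURL // or several, e.g. the eval stage's model list
	Name    string
	EnvName string // env var through which the worker locates the mount
}

type WorkerParam struct {
	WorkerType    string
	Env           map[string]string
	Mounts        []WorkerMount
	RestartPolicy v1.RestartPolicy
	HostNetwork   bool
}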
| func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLearningJob) error { | |||||
| func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { | |||||
| inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/") | inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/") | ||||
| jobSecret, err := jc.getSecret( | |||||
| jobSecret, err := c.getSecret( | |||||
| job.Namespace, | job.Namespace, | ||||
| job.Spec.CredentialName, | job.Spec.CredentialName, | ||||
| fmt.Sprintf("lifelonglearning job %s", job.Name), | fmt.Sprintf("lifelonglearning job %s", job.Name), | ||||
| @@ -674,10 +672,10 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea | |||||
| return err | return err | ||||
| } | } | ||||
| var workerParam *WorkerParam = new(WorkerParam) | |||||
| workerParam.mounts = append(workerParam.mounts, | |||||
| WorkerMount{ | |||||
| URL: &MountURL{ | |||||
| var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) | |||||
| workerParam.Mounts = append(workerParam.Mounts, | |||||
| runtime.WorkerMount{ | |||||
| URL: &runtime.MountURL{ | |||||
| URL: inferModelURL, | URL: inferModelURL, | ||||
| Secret: jobSecret, | Secret: jobSecret, | ||||
| DownloadByInitializer: false, | DownloadByInitializer: false, | ||||
| @@ -687,75 +685,53 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea | |||||
| }, | }, | ||||
| ) | ) | ||||
| workerParam.env = map[string]string{ | |||||
| workerParam.Env = map[string]string{ | |||||
| "NAMESPACE": job.Namespace, | "NAMESPACE": job.Namespace, | ||||
| "JOB_NAME": job.Name, | "JOB_NAME": job.Name, | ||||
| "WORKER_NAME": "inferworker-" + utilrand.String(5), | "WORKER_NAME": "inferworker-" + utilrand.String(5), | ||||
| "LC_SERVER": jc.cfg.LC.Server, | |||||
| "LC_SERVER": c.cfg.LC.Server, | |||||
| } | } | ||||
| workerParam.workerType = InferencePodType | |||||
| workerParam.hostNetwork = true | |||||
| workerParam.WorkerType = runtime.InferencePodType | |||||
| workerParam.HostNetwork = true | |||||
| // create edge pod | // create edge pod | ||||
| _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) | |||||
| _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) | |||||
| return err | return err | ||||
| } | } | ||||
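For reference, the deploy-stage model index is always resolved against the job's output directory; with an invented OutputDir, the computation above gives:

// "s3://bucket/job-1/" (trailing slash trimmed) becomes
// "s3://bucket/job-1/deploy/index.pkl"
inferModelURL := strings.Join([]string{strings.TrimRight("s3://bucket/job-1/", "/"), "deploy/index.pkl"}, "/")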
| // GetName returns the name of the lifelonglearning job controller | |||||
| func (jc *LifelongLearningJobController) GetName() string { | |||||
| return "LifelongLearningJobController" | |||||
| } | |||||
| // NewLifelongLearningJobController creates a new LifelongLearningJob controller that keeps the relevant pods | |||||
| // New creates a new LifelongLearningJob controller that keeps the relevant pods | |||||
| // in sync with their corresponding LifelongLearningJob objects. | // in sync with their corresponding LifelongLearningJob objects. | ||||
| func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { | |||||
| namespace := cfg.Namespace | |||||
| if namespace == "" { | |||||
| namespace = metav1.NamespaceAll | |||||
| } | |||||
| kubeClient, err := utils.KubeClient() | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| kubecfg, err := utils.KubeConfig() | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| crdclient, err := clientset.NewForConfig(kubecfg) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) | |||||
| func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { | |||||
| cfg := cc.Config | |||||
| podInformer := kubeInformerFactory.Core().V1().Pods() | |||||
| podInformer := cc.KubeInformerFactory.Core().V1().Pods() | |||||
| jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) | |||||
| jobInformer := jobInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() | |||||
| jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() | |||||
| eventBroadcaster := record.NewBroadcaster() | eventBroadcaster := record.NewBroadcaster() | ||||
| eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) | |||||
| eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) | |||||
| jc := &LifelongLearningJobController{ | |||||
| kubeClient: kubeClient, | |||||
| client: crdclient.SednaV1alpha1(), | |||||
| queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "lifelonglearningjob"), | |||||
| recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}), | |||||
| jc := &Controller{ | |||||
| kubeClient: cc.KubeClient, | |||||
| client: cc.SednaClient.SednaV1alpha1(), | |||||
| queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), | |||||
| cfg: cfg, | cfg: cfg, | ||||
| } | } | ||||
| jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||||
| AddFunc: func(obj interface{}) { | AddFunc: func(obj interface{}) { | ||||
| jc.enqueueController(obj, true) | jc.enqueueController(obj, true) | ||||
| jc.syncToEdge(watch.Added, obj) | |||||
| }, | }, | ||||
| UpdateFunc: func(old, cur interface{}) { | UpdateFunc: func(old, cur interface{}) { | ||||
| jc.enqueueController(cur, true) | jc.enqueueController(cur, true) | ||||
| jc.syncToEdge(watch.Added, cur) | |||||
| }, | }, | ||||
| DeleteFunc: func(obj interface{}) { | DeleteFunc: func(obj interface{}) { | ||||
| jc.enqueueController(obj, true) | jc.enqueueController(obj, true) | ||||
| jc.syncToEdge(watch.Deleted, obj) | |||||
| }, | }, | ||||
| }) | }) | ||||
| jc.jobLister = jobInformer.Lister() | jc.jobLister = jobInformer.Lister() | ||||
| @@ -769,8 +745,5 @@ func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureCont | |||||
| jc.podStore = podInformer.Lister() | jc.podStore = podInformer.Lister() | ||||
| jc.podStoreSynced = podInformer.Informer().HasSynced | jc.podStoreSynced = podInformer.Informer().HasSynced | ||||
| stopCh := make(chan struct{}) | |||||
| kubeInformerFactory.Start(stopCh) | |||||
| jobInformerFactory.Start(stopCh) | |||||
| return jc, err | |||||
| return jc, nil | |||||
| } | } | ||||
| @@ -0,0 +1,164 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package lifelonglearning | |||||
| import ( | |||||
| "context" | |||||
| "encoding/json" | |||||
| "fmt" | |||||
| "strings" | |||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | |||||
| v1 "k8s.io/api/core/v1" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | |||||
| type Model = runtime.Model | |||||
| // ConditionData is the data of this condition, including the input/output needed for the next step | |||||
| type ConditionData struct { | |||||
| Input *struct { | |||||
| // Model is set in the single-model case | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| DataURL string `json:"dataURL,omitempty"` | |||||
| // the references to the data samples will be stored at this URL. | |||||
| // The content of this URL would be: | |||||
| // # the first uncommented line means the directory | |||||
| // s3://dataset/ | |||||
| // mnist/0.jpg | |||||
| // mnist/1.jpg | |||||
| DataIndexURL string `json:"dataIndexURL,omitempty"` | |||||
| OutputDir string `json:"outputDir,omitempty"` | |||||
| } `json:"input,omitempty"` | |||||
| Output *struct { | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| } `json:"output,omitempty"` | |||||
| } | |||||
| func (cd *ConditionData) joinModelURLs(model *Model, models []Model) []string { | |||||
| var modelURLs []string | |||||
| if model != nil { | |||||
| modelURLs = append(modelURLs, model.GetURL()) | |||||
| } else { | |||||
| for _, m := range models { | |||||
| modelURLs = append(modelURLs, m.GetURL()) | |||||
| } | |||||
| } | |||||
| return modelURLs | |||||
| } | |||||
| func (cd *ConditionData) Unmarshal(data []byte) error { | |||||
| return json.Unmarshal(data, cd) | |||||
| } | |||||
| func (cd ConditionData) Marshal() ([]byte, error) { | |||||
| return json.Marshal(cd) | |||||
| } | |||||
| func (cd *ConditionData) GetInputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) | |||||
| } | |||||
| func (cd *ConditionData) GetOutputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) | |||||
| } | |||||
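joinModelURLs prefers the single Input.Model over Input.Models when both are set. A hypothetical payload, assuming runtime.Model is a struct whose url field is returned by GetURL (all values invented):

package lifelonglearning

import "fmt"

func ExampleConditionData_GetInputModelURLs() {
	raw := []byte(`{
		"input": {
			"models": [{"format": "pkl", "url": "s3://kb/index.pkl"}],
			"dataURL": "s3://dataset/train.txt",
			"outputDir": "s3://output/job-1"
		}
	}`)
	var cond ConditionData
	if err := cond.Unmarshal(raw); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(cond.GetInputModelURLs())
	// Output: [s3://kb/index.pkl]
}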
| func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { | |||||
| client := c.client.LifelongLearningJobs(namespace) | |||||
| return runtime.RetryUpdateStatus(name, namespace, func() error { | |||||
| job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| job.Status.Conditions = append(job.Status.Conditions, cond) | |||||
| _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | |||||
| // updateFromEdge syncs the edge updates to k8s | |||||
| func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { | |||||
| var jobStatus struct { | |||||
| Phase string `json:"phase"` | |||||
| Status string `json:"status"` | |||||
| } | |||||
| err := json.Unmarshal(content, &jobStatus) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| // Get the condition data. | |||||
| // Unmarshal and immediately re-marshal to drop the unnecessary fields | |||||
| var condData ConditionData | |||||
| err = json.Unmarshal(content, &condData) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| condDataBytes, _ := json.Marshal(&condData) | |||||
| cond := sednav1.LLJobCondition{ | |||||
| Status: v1.ConditionTrue, | |||||
| LastHeartbeatTime: metav1.Now(), | |||||
| LastTransitionTime: metav1.Now(), | |||||
| Data: string(condDataBytes), | |||||
| Message: "reported by lc", | |||||
| } | |||||
| switch strings.ToLower(jobStatus.Phase) { | |||||
| case "train": | |||||
| cond.Stage = sednav1.LLJobTrain | |||||
| case "eval": | |||||
| cond.Stage = sednav1.LLJobEval | |||||
| case "deploy": | |||||
| cond.Stage = sednav1.LLJobDeploy | |||||
| default: | |||||
| return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) | |||||
| } | |||||
| switch strings.ToLower(jobStatus.Status) { | |||||
| case "ready": | |||||
| cond.Type = sednav1.LLJobStageCondReady | |||||
| case "completed": | |||||
| cond.Type = sednav1.LLJobStageCondCompleted | |||||
| case "failed": | |||||
| cond.Type = sednav1.LLJobStageCondFailed | |||||
| case "waiting": | |||||
| cond.Type = sednav1.LLJobStageCondWaiting | |||||
| default: | |||||
| return fmt.Errorf("invalid condition type: %v", jobStatus.Status) | |||||
| } | |||||
| err = c.appendStatusCondition(name, namespace, cond) | |||||
| if err != nil { | |||||
| return fmt.Errorf("failed to append condition, err:%+w", err) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { | |||||
| return addFunc(KindName, c.updateFromEdge) | |||||
| } | |||||
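Putting updateFromEdge together: the same bytes are decoded twice, once into the anonymous phase/status struct for routing and once into ConditionData, whose re-marshaled form becomes the condition's Data. A hypothetical LC report (field values invented):

// What the LC might send after a finished train stage; delivered as
// updateFromEdge("job-1", "default", "status", content).
var content = []byte(`{
	"phase": "train",
	"status": "completed",
	"output": {
		"models": [{"format": "pkl", "url": "s3://output/job-1/model.pkl"}]
	}
}`)

// The result is a new LLJobCondition with Stage=LLJobTrain,
// Type=LLJobStageCondCompleted, and Data carrying only the "output" part.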
| @@ -0,0 +1,128 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package controllers | |||||
| import ( | |||||
| "fmt" | |||||
| "math/rand" | |||||
| "time" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| kubeinformers "k8s.io/client-go/informers" | |||||
| "k8s.io/klog/v2" | |||||
| clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" | |||||
| sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" | |||||
| websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/utils" | |||||
| ) | |||||
| // Manager defines the controller manager | |||||
| type Manager struct { | |||||
| Config *config.ControllerConfig | |||||
| } | |||||
| // New creates the controller manager | |||||
| func New(cc *config.ControllerConfig) *Manager { | |||||
| config.InitConfigure(cc) | |||||
| return &Manager{ | |||||
| Config: cc, | |||||
| } | |||||
| } | |||||
| func genResyncPeriod(minPeriod time.Duration) time.Duration { | |||||
| factor := rand.Float64() + 1 | |||||
| // [minPeriod, 2*minPeriod) | |||||
| return time.Duration(factor * float64(minPeriod.Nanoseconds())) | |||||
| } | |||||
| // Start starts the controllers it manages | |||||
| func (m *Manager) Start() error { | |||||
| kubeClient, err := utils.KubeClient() | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| kubecfg, err := utils.KubeConfig() | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| sednaClient, err := clientset.NewForConfig(kubecfg) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| cfg := m.Config | |||||
| namespace := cfg.Namespace | |||||
| if namespace == "" { | |||||
| namespace = metav1.NamespaceAll | |||||
| } | |||||
| // TODO(llhuii): make this period configurable | |||||
| minResyncPeriod := time.Second * 30 | |||||
| kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, genResyncPeriod(minResyncPeriod), kubeinformers.WithNamespace(namespace)) | |||||
| sednaInformerFactory := sednainformers.NewSharedInformerFactoryWithOptions(sednaClient, genResyncPeriod(minResyncPeriod), sednainformers.WithNamespace(namespace)) | |||||
| context := &runtime.ControllerContext{ | |||||
| Config: m.Config, | |||||
| KubeClient: kubeClient, | |||||
| KubeInformerFactory: kubeInformerFactory, | |||||
| SednaClient: sednaClient, | |||||
| SednaInformerFactory: sednaInformerFactory, | |||||
| } | |||||
| uc, _ := NewUpstreamController(context) | |||||
| downstreamSendFunc := messagelayer.NewContextMessageLayer().SendResourceObject | |||||
| stopCh := make(chan struct{}) | |||||
| go uc.Run(stopCh) | |||||
| for name, factory := range NewRegistry() { | |||||
| f, err := factory(context) | |||||
| if err != nil { | |||||
| return fmt.Errorf("failed to initialize controller %s: %v", name, err) | |||||
| } | |||||
| f.SetDownstreamSendFunc(downstreamSendFunc) | |||||
| f.SetUpstreamHandler(uc.Add) | |||||
| klog.Infof("initialized controller %s", name) | |||||
| go f.Run(stopCh) | |||||
| } | |||||
| kubeInformerFactory.Start(stopCh) | |||||
| sednaInformerFactory.Start(stopCh) | |||||
| addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port) | |||||
| ws := websocket.NewServer(addr) | |||||
| err = ws.ListenAndServe() | |||||
| if err != nil { | |||||
| close(stopCh) | |||||
| return fmt.Errorf("failed to serve websocket at %s: %v", addr, err) | |||||
| } | |||||
| return nil | |||||
| } | |||||
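Start treats every feature controller uniformly, so the runtime.FeatureControllerI contract can be read off this loop even though its declaration is not in this diff; it is presumably close to:

// Inferred from the calls in Start(); the authoritative definition is in
// pkg/globalmanager/runtime.
type FeatureControllerI interface {
	Run(stopCh <-chan struct{})
	SetDownstreamSendFunc(f DownstreamSendFunc) error
	SetUpstreamHandler(addFunc UpstreamHandlerAddFunc) error
}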
| @@ -0,0 +1,40 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package controllers | |||||
| import ( | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/controllers/dataset" | |||||
| fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning" | |||||
| il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" | |||||
| ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference" | |||||
| ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | |||||
| type FeatureFactory = func(*runtime.ControllerContext) (runtime.FeatureControllerI, error) | |||||
| type Registry map[string]FeatureFactory | |||||
| func NewRegistry() Registry { | |||||
| return Registry{ | |||||
| ji.Name: ji.New, | |||||
| fl.Name: fl.New, | |||||
| il.Name: il.New, | |||||
| ll.Name: ll.New, | |||||
| dataset.Name: dataset.New, | |||||
| } | |||||
| } | |||||
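With the registry in place, adding a feature means following the Name/New convention and adding one map entry. A hypothetical skeleton (the myfeature package is invented for illustration):

package myfeature

import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

// Name is the key under which this controller would be registered.
const Name = "MyFeature"

type Controller struct{}

func (c *Controller) Run(stopCh <-chan struct{}) { <-stopCh }

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { return nil }

func (c *Controller) SetUpstreamHandler(add runtime.UpstreamHandlerAddFunc) error { return nil }

// New matches the FeatureFactory signature expected by NewRegistry.
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
	return &Controller{}, nil
}

Registering it would then be a single extra entry in the map above: myfeature.Name: myfeature.New.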
| @@ -0,0 +1,105 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package controllers | |||||
| import ( | |||||
| "fmt" | |||||
| "strings" | |||||
| "k8s.io/klog/v2" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/runtime" | |||||
| ) | |||||
| // UpstreamController subscribes to updates from the edge and syncs them to the k8s api server | |||||
| type UpstreamController struct { | |||||
| messageLayer messagelayer.MessageLayer | |||||
| updateHandlers map[string]runtime.UpstreamHandler | |||||
| } | |||||
| func (uc *UpstreamController) checkOperation(operation string) error { | |||||
| // currently only the 'status' operation is supported | |||||
| if operation != "status" { | |||||
| return fmt.Errorf("unknown operation '%s'", operation) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| // syncEdgeUpdate receives updates from the edge and syncs them to k8s. | |||||
| func (uc *UpstreamController) syncEdgeUpdate() { | |||||
| for { | |||||
| select { | |||||
| case <-uc.messageLayer.Done(): | |||||
| klog.Info("Stop sedna upstream loop") | |||||
| return | |||||
| default: | |||||
| } | |||||
| update, err := uc.messageLayer.ReceiveResourceUpdate() | |||||
| if err == nil { | |||||
| err = uc.checkOperation(update.Operation) | |||||
| } | |||||
| if err != nil { | |||||
| klog.Warningf("Ignore update since this err: %+v", err) | |||||
| continue | |||||
| } | |||||
| kind := update.Kind | |||||
| namespace := update.Namespace | |||||
| name := update.Name | |||||
| operation := update.Operation | |||||
| handler, ok := uc.updateHandlers[kind] | |||||
| if ok { | |||||
| err := handler(name, namespace, operation, update.Content) | |||||
| if err != nil { | |||||
| klog.Errorf("Error to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err) | |||||
| } | |||||
| } else { | |||||
| klog.Warningf("No handler for resource kind %s", kind) | |||||
| } | |||||
| } | |||||
| } | |||||
| // Run starts the upstream controller | |||||
| func (uc *UpstreamController) Run(stopCh <-chan struct{}) { | |||||
| klog.Info("Start the sedna upstream controller") | |||||
| uc.syncEdgeUpdate() | |||||
| <-stopCh | |||||
| } | |||||
| func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamHandler) error { | |||||
| kind = strings.ToLower(kind) | |||||
| if _, ok := uc.updateHandlers[kind]; ok { | |||||
| return fmt.Errorf("a upstream handler for kind %s already exists", kind) | |||||
| } | |||||
| uc.updateHandlers[kind] = handler | |||||
| return nil | |||||
| } | |||||
| // NewUpstreamController creates a new upstream controller from the controller context | |||||
| func NewUpstreamController(cc *runtime.ControllerContext) (*UpstreamController, error) { | |||||
| uc := &UpstreamController{ | |||||
| messageLayer: messagelayer.NewContextMessageLayer(), | |||||
| updateHandlers: make(map[string]runtime.UpstreamHandler), | |||||
| } | |||||
| return uc, nil | |||||
| } | |||||
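To see how this is consumed: the manager hands uc.Add to every feature controller through SetUpstreamHandler (the loop at the top of this diff), and each feature registers one handler per resource kind. A hedged sketch for a dataset-like feature — the Controller type and handler body are illustrative; only the signatures come from this change:

    // Sketch: a feature controller registering its upstream status handler.
    package dataset

    import (
        "encoding/json"

        sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
        "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
    )

    type Controller struct{} // placeholder; the real controller holds the clients

    func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
        // Add lowercases the kind, so "dataset" is the final handler key.
        return addFunc("dataset", func(name, namespace, operation string, content []byte) error {
            // content carries the status payload reported by the LC
            var status sednav1.DatasetStatus
            if err := json.Unmarshal(content, &status); err != nil {
                return err
            }
            // ...persist status to the Dataset CR via the sedna clientset...
            _ = status
            return nil
        })
    }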
| @@ -1,388 +0,0 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package globalmanager | |||||
| import ( | |||||
| "context" | |||||
| "fmt" | |||||
| "time" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "k8s.io/apimachinery/pkg/fields" | |||||
| "k8s.io/apimachinery/pkg/runtime" | |||||
| "k8s.io/apimachinery/pkg/watch" | |||||
| "k8s.io/client-go/kubernetes" | |||||
| "k8s.io/client-go/tools/cache" | |||||
| "k8s.io/klog/v2" | |||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | |||||
| clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/utils" | |||||
| ) | |||||
| // DownstreamController watches the kubernetes api server and sends the resource changes to the edge | |||||
| type DownstreamController struct { | |||||
| // events from watch kubernetes api server | |||||
| events chan watch.Event | |||||
| cfg *config.ControllerConfig | |||||
| client *clientset.SednaV1alpha1Client | |||||
| kubeClient kubernetes.Interface | |||||
| messageLayer messagelayer.MessageLayer | |||||
| } | |||||
| func (dc *DownstreamController) injectSecret(obj CommonInterface, secretName string) error { | |||||
| if secretName == "" { | |||||
| return nil | |||||
| } | |||||
| secret, err := dc.kubeClient.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| klog.Warningf("failed to get the secret %s: %+v", | |||||
| secretName, err) | |||||
| return err | |||||
| } | |||||
| InjectSecretObj(obj, secret) | |||||
| return err | |||||
| } | |||||
| // syncDataset syncs the dataset resources | |||||
| func (dc *DownstreamController) syncDataset(eventType watch.EventType, dataset *sednav1.Dataset) error { | |||||
| // Here only propagate to the nodes with non-empty name | |||||
| nodeName := dataset.Spec.NodeName | |||||
| if len(nodeName) == 0 { | |||||
| return fmt.Errorf("empty node name") | |||||
| } | |||||
| dc.injectSecret(dataset, dataset.Spec.CredentialName) | |||||
| return dc.messageLayer.SendResourceObject(nodeName, eventType, dataset) | |||||
| } | |||||
| // syncJointInferenceService syncs the joint-inference-service resources | |||||
| func (dc *DownstreamController) syncJointInferenceService(eventType watch.EventType, joint *sednav1.JointInferenceService) error { | |||||
| // Here only propagate to the nodes with non-empty name | |||||
| // FIXME: only the case where Spec.NodeName is specified is supported | |||||
| nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName | |||||
| if len(nodeName) == 0 { | |||||
| return fmt.Errorf("empty node name") | |||||
| } | |||||
| return dc.messageLayer.SendResourceObject(nodeName, eventType, joint) | |||||
| } | |||||
| // syncFederatedLearningJob syncs the federated resources | |||||
| func (dc *DownstreamController) syncFederatedLearningJob(eventType watch.EventType, job *sednav1.FederatedLearningJob) error { | |||||
| // broadcast to all nodes specified in spec | |||||
| nodeset := make(map[string]bool) | |||||
| for _, trainingWorker := range job.Spec.TrainingWorkers { | |||||
| // Here only propagate to the nodes with non-empty name | |||||
| if len(trainingWorker.Template.Spec.NodeName) > 0 { | |||||
| nodeset[trainingWorker.Template.Spec.NodeName] = true | |||||
| } | |||||
| } | |||||
| for nodeName := range nodeset { | |||||
| dc.messageLayer.SendResourceObject(nodeName, eventType, job) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| // syncModelWithName will sync the model to the specified node. | |||||
| // Currently called when creating an incremental learning job. | |||||
| func (dc *DownstreamController) syncModelWithName(nodeName, modelName, namespace string) error { | |||||
| model, err := dc.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| // TODO: maybe use err.ErrStatus.Code == 404 | |||||
| return fmt.Errorf("model(%s/%s) not found", namespace, modelName) | |||||
| } | |||||
| // Since model.Kind may be empty, | |||||
| // we need to fix the kind here if missing. | |||||
| // more details at https://github.com/kubernetes/kubernetes/issues/3030 | |||||
| if len(model.Kind) == 0 { | |||||
| model.Kind = "Model" | |||||
| } | |||||
| dc.injectSecret(model, model.Spec.CredentialName) | |||||
| dc.messageLayer.SendResourceObject(nodeName, watch.Added, model) | |||||
| return nil | |||||
| } | |||||
| // syncIncrementalJob syncs the incremental learning jobs | |||||
| func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, job *sednav1.IncrementalLearningJob) error { | |||||
| jobConditions := job.Status.Conditions | |||||
| if len(jobConditions) == 0 { | |||||
| return nil | |||||
| } | |||||
| dataName := job.Spec.Dataset.Name | |||||
| ds, err := dc.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName) | |||||
| } | |||||
| // The LC on this node holds the dataset object, so we call this node the dataset node | |||||
| dsNodeName := ds.Spec.NodeName | |||||
| var trainNodeName string | |||||
| var evalNodeName string | |||||
| ann := job.GetAnnotations() | |||||
| if ann != nil { | |||||
| trainNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] | |||||
| evalNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobEval)] | |||||
| } | |||||
| if eventType == watch.Deleted { | |||||
| // delete jobs from all LCs | |||||
| for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} { | |||||
| if v != "" { | |||||
| dc.messageLayer.SendResourceObject(v, eventType, job) | |||||
| } | |||||
| } | |||||
| return nil | |||||
| } | |||||
| latestCondition := jobConditions[len(jobConditions)-1] | |||||
| currentType := latestCondition.Type | |||||
| jobStage := latestCondition.Stage | |||||
| syncModelWithName := func(modelName string) { | |||||
| if err := dc.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil { | |||||
| klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", | |||||
| modelName, job.Name, dsNodeName, err) | |||||
| } | |||||
| } | |||||
| syncJobWithNodeName := func(nodeName string) { | |||||
| if err := dc.messageLayer.SendResourceObject(nodeName, eventType, job); err != nil { | |||||
| klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v", | |||||
| job.Name, nodeName, jobStage, err) | |||||
| } | |||||
| } | |||||
| dc.injectSecret(job, job.Spec.CredentialName) | |||||
| doJobStageEvent := func(modelName string, nodeName string) { | |||||
| if currentType == sednav1.ILJobStageCondWaiting { | |||||
| syncJobWithNodeName(dsNodeName) | |||||
| syncModelWithName(modelName) | |||||
| } else if currentType == sednav1.ILJobStageCondRunning { | |||||
| if nodeName != "" { | |||||
| syncJobWithNodeName(nodeName) | |||||
| } | |||||
| } else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed { | |||||
| if nodeName != dsNodeName { | |||||
| // delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed. | |||||
| dc.messageLayer.SendResourceObject(nodeName, watch.Deleted, job) | |||||
| } | |||||
| } | |||||
| } | |||||
| switch jobStage { | |||||
| case sednav1.ILJobTrain: | |||||
| doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) | |||||
| case sednav1.ILJobEval: | |||||
| doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| // syncLifelongLearningJob syncs the lifelonglearning jobs | |||||
| func (dc *DownstreamController) syncLifelongLearningJob(eventType watch.EventType, job *sednav1.LifelongLearningJob) error { | |||||
| // Here only propagate to the nodes with non-empty name | |||||
| // FIXME(llhuii): only the case where all workers have the same nodeName is supported; | |||||
| // Spec.NodeSelector and different nodeNames will be supported later. | |||||
| nodeName := job.Spec.TrainSpec.Template.Spec.NodeName | |||||
| if len(nodeName) == 0 { | |||||
| return fmt.Errorf("empty node name") | |||||
| } | |||||
| dc.injectSecret(job, job.Spec.CredentialName) | |||||
| dc.messageLayer.SendResourceObject(nodeName, eventType, job) | |||||
| return nil | |||||
| } | |||||
| // sync defines the entrypoint of syncing all resources | |||||
| func (dc *DownstreamController) sync(stopCh <-chan struct{}) { | |||||
| for { | |||||
| select { | |||||
| case <-stopCh: | |||||
| klog.Info("Stop controller downstream loop") | |||||
| return | |||||
| case e := <-dc.events: | |||||
| var err error | |||||
| var kind, namespace, name string | |||||
| switch t := e.Object.(type) { | |||||
| case (*sednav1.Dataset): | |||||
| // Since t.Kind may be empty, | |||||
| // we need to fix the kind here if missing. | |||||
| // more details at https://github.com/kubernetes/kubernetes/issues/3030 | |||||
| if len(t.Kind) == 0 { | |||||
| t.Kind = "Dataset" | |||||
| } | |||||
| kind = t.Kind | |||||
| namespace = t.Namespace | |||||
| name = t.Name | |||||
| err = dc.syncDataset(e.Type, t) | |||||
| case (*sednav1.JointInferenceService): | |||||
| // TODO: find a good way to avoid this duplicated code | |||||
| if len(t.Kind) == 0 { | |||||
| t.Kind = "JointInferenceService" | |||||
| } | |||||
| kind = t.Kind | |||||
| namespace = t.Namespace | |||||
| name = t.Name | |||||
| err = dc.syncJointInferenceService(e.Type, t) | |||||
| case (*sednav1.FederatedLearningJob): | |||||
| if len(t.Kind) == 0 { | |||||
| t.Kind = "FederatedLearningJob" | |||||
| } | |||||
| kind = t.Kind | |||||
| namespace = t.Namespace | |||||
| name = t.Name | |||||
| err = dc.syncFederatedLearningJob(e.Type, t) | |||||
| case (*sednav1.IncrementalLearningJob): | |||||
| if len(t.Kind) == 0 { | |||||
| t.Kind = "IncrementalLearningJob" | |||||
| } | |||||
| kind = t.Kind | |||||
| namespace = t.Namespace | |||||
| name = t.Name | |||||
| err = dc.syncIncrementalJob(e.Type, t) | |||||
| case (*sednav1.LifelongLearningJob): | |||||
| if len(t.Kind) == 0 { | |||||
| t.Kind = "LifelongLearningJob" | |||||
| } | |||||
| kind = t.Kind | |||||
| namespace = t.Namespace | |||||
| name = t.Name | |||||
| err = dc.syncLifelongLearningJob(e.Type, t) | |||||
| default: | |||||
| klog.Warningf("object type: %T unsupported", e) | |||||
| continue | |||||
| } | |||||
| if err != nil { | |||||
| klog.Warningf("Error to sync %s(%s/%s), err: %+v", kind, namespace, name, err) | |||||
| } else { | |||||
| klog.V(2).Infof("synced %s(%s/%s)", kind, namespace, name) | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| // watch function watches the crd resources which should be synced to the nodes | |||||
| func (dc *DownstreamController) watch(stopCh <-chan struct{}) { | |||||
| rh := cache.ResourceEventHandlerFuncs{ | |||||
| AddFunc: func(obj interface{}) { | |||||
| eventObj := obj.(runtime.Object) | |||||
| dc.events <- watch.Event{Type: watch.Added, Object: eventObj} | |||||
| }, | |||||
| UpdateFunc: func(old, cur interface{}) { | |||||
| // Since we don't support the spec update operation currently, | |||||
| // only status updates arrive here, and there is no propagation to the edge. | |||||
| // Update: | |||||
| // We sync it to edge when using self-built websocket, and | |||||
| // this sync isn't needed when we switch out self-built websocket. | |||||
| dc.events <- watch.Event{Type: watch.Added, Object: cur.(runtime.Object)} | |||||
| }, | |||||
| DeleteFunc: func(obj interface{}) { | |||||
| eventObj := obj.(runtime.Object) | |||||
| dc.events <- watch.Event{Type: watch.Deleted, Object: eventObj} | |||||
| }, | |||||
| } | |||||
| client := dc.client.RESTClient() | |||||
| // make this option configurable | |||||
| resyncPeriod := time.Second * 60 | |||||
| namespace := dc.cfg.Namespace | |||||
| // TODO: use the informer | |||||
| for resourceName, object := range map[string]runtime.Object{ | |||||
| "datasets": &sednav1.Dataset{}, | |||||
| "jointinferenceservices": &sednav1.JointInferenceService{}, | |||||
| "federatedlearningjobs": &sednav1.FederatedLearningJob{}, | |||||
| "incrementallearningjobs": &sednav1.IncrementalLearningJob{}, | |||||
| "lifelonglearningjobs": &sednav1.LifelongLearningJob{}, | |||||
| } { | |||||
| lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything()) | |||||
| si := cache.NewSharedInformer(lw, object, resyncPeriod) | |||||
| si.AddEventHandler(rh) | |||||
| go si.Run(stopCh) | |||||
| } | |||||
| } | |||||
| // Start starts the controller | |||||
| func (dc *DownstreamController) Start() error { | |||||
| stopCh := dc.messageLayer.Done() | |||||
| // watch is an asynchronous call | |||||
| dc.watch(stopCh) | |||||
| // sync is a synchronous call | |||||
| go dc.sync(stopCh) | |||||
| return nil | |||||
| } | |||||
| // GetName returns the name of the downstream controller | |||||
| func (dc *DownstreamController) GetName() string { | |||||
| return "DownstreamController" | |||||
| } | |||||
| // NewDownstreamController creates a DownstreamController from config | |||||
| func NewDownstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) { | |||||
| // TODO: make bufferSize configurable | |||||
| bufferSize := 10 | |||||
| events := make(chan watch.Event, bufferSize) | |||||
| crdclient, err := utils.NewCRDClient() | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("create crd client failed with error: %w", err) | |||||
| } | |||||
| kubeClient, err := utils.KubeClient() | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| dc := &DownstreamController{ | |||||
| cfg: cfg, | |||||
| events: events, | |||||
| client: crdclient, | |||||
| kubeClient: kubeClient, | |||||
| messageLayer: messagelayer.NewContextMessageLayer(), | |||||
| } | |||||
| return dc, nil | |||||
| } | |||||
| @@ -14,10 +14,11 @@ See the License for the specific language governing permissions and | |||||
| limitations under the License. | limitations under the License. | ||||
| */ | */ | ||||
| package globalmanager | |||||
| package runtime | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "encoding/json" | |||||
| "fmt" | "fmt" | ||||
| "math" | "math" | ||||
| "strings" | "strings" | ||||
| @@ -27,16 +28,14 @@ import ( | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| "k8s.io/client-go/kubernetes" | "k8s.io/client-go/kubernetes" | ||||
| "k8s.io/client-go/util/workqueue" | "k8s.io/client-go/util/workqueue" | ||||
| "k8s.io/klog/v2" | |||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| // DefaultBackOff is the default backoff period | |||||
| DefaultBackOff = 10 * time.Second | |||||
| // MaxBackOff is the max backoff period | |||||
| MaxBackOff = 360 * time.Second | |||||
| bigModelPort int32 = 5000 | |||||
| // ResourceUpdateRetries defines times of retrying to update resource | |||||
| ResourceUpdateRetries = 3 | |||||
| // resourceUpdateTries defines times of trying to update resource | |||||
| resourceUpdateTries = 3 | |||||
| ) | ) | ||||
| // GetNodeIPByName get node ip by node name | // GetNodeIPByName get node ip by node name | ||||
| @@ -62,8 +61,8 @@ func GetNodeIPByName(kubeClient kubernetes.Interface, name string) (string, erro | |||||
| return "", fmt.Errorf("can't found node ip for node %s", name) | return "", fmt.Errorf("can't found node ip for node %s", name) | ||||
| } | } | ||||
| // getBackoff calc the next wait time for the key | |||||
| func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration { | |||||
| // GetBackoff calculates the next wait time for the key | |||||
| func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration { | |||||
| exp := queue.NumRequeues(key) | exp := queue.NumRequeues(key) | ||||
| if exp <= 0 { | if exp <= 0 { | ||||
| @@ -83,7 +82,7 @@ func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Dur | |||||
| return calculated | return calculated | ||||
| } | } | ||||
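The hunk elides the middle of GetBackoff; for readers following along, the usual shape of such a function is a capped exponential on the requeue count. An illustrative reconstruction under that assumption (DefaultBackOff, MaxBackOff, and queue.NumRequeues come from the diff; the math and time imports and the exact formula are assumed):

    // Illustrative only: exponential backoff on the requeue count,
    // capped at MaxBackOff to bound both overflow and wait time.
    func backoffSketch(numRequeues int) time.Duration {
        if numRequeues <= 0 {
            return DefaultBackOff
        }
        backoff := float64(DefaultBackOff.Nanoseconds()) * math.Pow(2, float64(numRequeues))
        if backoff > math.MaxInt64 {
            return MaxBackOff
        }
        if calculated := time.Duration(backoff); calculated < MaxBackOff {
            return calculated
        }
        return MaxBackOff
    }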
| func calcActivePodCount(pods []*v1.Pod) int32 { | |||||
| func CalcActivePodCount(pods []*v1.Pod) int32 { | |||||
| var result int32 = 0 | var result int32 = 0 | ||||
| for _, p := range pods { | for _, p := range pods { | ||||
| if v1.PodSucceeded != p.Status.Phase && | if v1.PodSucceeded != p.Status.Phase && | ||||
| @@ -129,3 +128,35 @@ func ConvertK8SValidName(name string) string { | |||||
| return string(fixName) | return string(fixName) | ||||
| } | } | ||||
| // ConvertMapToMetrics converts the metric map to a list of resource Metrics | |||||
| func ConvertMapToMetrics(metric map[string]interface{}) []sednav1.Metric { | |||||
| var l []sednav1.Metric | |||||
| for k, v := range metric { | |||||
| var displayValue string | |||||
| switch t := v.(type) { | |||||
| case string: | |||||
| displayValue = t | |||||
| default: | |||||
| // ignore the json marshal error | |||||
| b, _ := json.Marshal(v) | |||||
| displayValue = string(b) | |||||
| } | |||||
| l = append(l, sednav1.Metric{Key: k, Value: displayValue}) | |||||
| } | |||||
| return l | |||||
| } | |||||
| // RetryUpdateStatus simply retries calling the status update func | |||||
| func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error { | |||||
| var err error | |||||
| for try := 1; try <= resourceUpdateTries; try++ { | |||||
| err = updateStatusFunc() | |||||
| if err == nil { | |||||
| return nil | |||||
| } | |||||
| klog.Warningf("Error to update %s/%s status, tried %d times: %+v", namespace, name, try, err) | |||||
| } | |||||
| return err | |||||
| } | |||||
| @@ -14,13 +14,16 @@ See the License for the specific language governing permissions and | |||||
| limitations under the License. | limitations under the License. | ||||
| */ | */ | ||||
| package globalmanager | |||||
| package runtime | |||||
| import ( | import ( | ||||
| "context" | |||||
| "encoding/json" | "encoding/json" | ||||
| "fmt" | "fmt" | ||||
| v1 "k8s.io/api/core/v1" | v1 "k8s.io/api/core/v1" | ||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "k8s.io/client-go/kubernetes" | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| @@ -106,11 +109,18 @@ func MergeSecretEnvs(nowE, newE []v1.EnvVar, overwrite bool) []v1.EnvVar { | |||||
| return nowE | return nowE | ||||
| } | } | ||||
| func InjectSecretObj(obj CommonInterface, secret *v1.Secret) { | |||||
| if secret == nil { | |||||
| func InjectSecretAnnotations(client kubernetes.Interface, obj CommonInterface, secretName string) (err error) { | |||||
| if len(secretName) == 0 { | |||||
| return | |||||
| } | |||||
| secret, err := client.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return | return | ||||
| } | } | ||||
| return injectSecretObj(obj, secret) | |||||
| } | |||||
| func injectSecretObj(obj CommonInterface, secret *v1.Secret) (err error) { | |||||
| secretData := secret.GetAnnotations() | secretData := secret.GetAnnotations() | ||||
| for k, v := range secret.Data { | for k, v := range secret.Data { | ||||
| @@ -127,4 +137,5 @@ func InjectSecretObj(obj CommonInterface, secret *v1.Secret) { | |||||
| ann[SecretAnnotationKey] = string(b) | ann[SecretAnnotationKey] = string(b) | ||||
| obj.SetAnnotations(ann) | obj.SetAnnotations(ann) | ||||
| return nil | |||||
| } | } | ||||
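A call-site sketch for the new InjectSecretAnnotations, which folds the old get-secret-then-inject two-step into one call; the kubeClient and dataset variables are assumed:

    // Sketch: attach the credential secret to the CR's annotations
    // before the object is synced down to the edge node.
    if err := runtime.InjectSecretAnnotations(kubeClient, dataset, dataset.Spec.CredentialName); err != nil {
        klog.Warningf("failed to inject secret %s for dataset %s/%s: %v",
            dataset.Spec.CredentialName, dataset.Namespace, dataset.Name, err)
    }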
| @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and | |||||
| limitations under the License. | limitations under the License. | ||||
| */ | */ | ||||
| package globalmanager | |||||
| package runtime | |||||
| import ( | import ( | ||||
| "net/url" | "net/url" | ||||
| @@ -179,7 +179,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { | |||||
| hostPathType := v1.HostPathDirectory | hostPathType := v1.HostPathDirectory | ||||
| for _, mount := range workerParam.mounts { | |||||
| for _, mount := range workerParam.Mounts { | |||||
| for _, m := range mount.URLs { | for _, m := range mount.URLs { | ||||
| if m.HostPath == "" { | if m.HostPath == "" { | ||||
| continue | continue | ||||
| @@ -240,7 +240,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { | |||||
| func injectWorkerSecrets(pod *v1.Pod, workerParam *WorkerParam) { | func injectWorkerSecrets(pod *v1.Pod, workerParam *WorkerParam) { | ||||
| var secretEnvs []v1.EnvVar | var secretEnvs []v1.EnvVar | ||||
| for _, mount := range workerParam.mounts { | |||||
| for _, mount := range workerParam.Mounts { | |||||
| for _, m := range mount.URLs { | for _, m := range mount.URLs { | ||||
| if m.Disable || m.DownloadByInitializer { | if m.Disable || m.DownloadByInitializer { | ||||
| continue | continue | ||||
| @@ -259,7 +259,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { | |||||
| var downloadPairs []string | var downloadPairs []string | ||||
| var secretEnvs []v1.EnvVar | var secretEnvs []v1.EnvVar | ||||
| for _, mount := range workerParam.mounts { | |||||
| for _, mount := range workerParam.Mounts { | |||||
| for _, m := range mount.URLs { | for _, m := range mount.URLs { | ||||
| if m.Disable { | if m.Disable { | ||||
| continue | continue | ||||
| @@ -345,7 +345,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { | |||||
| func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { | func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { | ||||
| var mounts []WorkerMount | var mounts []WorkerMount | ||||
| // parse the mounts and environment key | // parse the mounts and environment key | ||||
| for _, mount := range workerParam.mounts { | |||||
| for _, mount := range workerParam.Mounts { | |||||
| var envPaths []string | var envPaths []string | ||||
| if mount.URL != nil { | if mount.URL != nil { | ||||
| @@ -374,13 +374,13 @@ func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { | |||||
| } | } | ||||
| if mount.EnvName != "" { | if mount.EnvName != "" { | ||||
| workerParam.env[mount.EnvName] = strings.Join( | |||||
| workerParam.Env[mount.EnvName] = strings.Join( | |||||
| envPaths, urlsFieldSep, | envPaths, urlsFieldSep, | ||||
| ) | ) | ||||
| } | } | ||||
| } | } | ||||
| workerParam.mounts = mounts | |||||
| workerParam.Mounts = mounts | |||||
| // need to call injectInitializerContainer before injectHostPathMount | // need to call injectInitializerContainer before injectHostPathMount | ||||
| // since injectHostPathMount could inject volumeMount to init container | // since injectHostPathMount could inject volumeMount to init container | ||||
| @@ -0,0 +1,103 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package runtime | |||||
| import ( | |||||
| "time" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| k8sruntime "k8s.io/apimachinery/pkg/runtime" | |||||
| "k8s.io/apimachinery/pkg/runtime/schema" | |||||
| "k8s.io/apimachinery/pkg/watch" | |||||
| kubeinformers "k8s.io/client-go/informers" | |||||
| "k8s.io/client-go/kubernetes" | |||||
| sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" | |||||
| sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" | |||||
| ) | |||||
| const ( | |||||
| // DefaultBackOff is the default backoff period | |||||
| DefaultBackOff = 10 * time.Second | |||||
| // MaxBackOff is the max backoff period | |||||
| MaxBackOff = 360 * time.Second | |||||
| // TrainPodType is type of train pod | |||||
| TrainPodType = "train" | |||||
| // EvalPodType is type of eval pod | |||||
| EvalPodType = "eval" | |||||
| // InferencePodType is type of inference pod | |||||
| InferencePodType = "inference" | |||||
| // AnnotationsKeyPrefix defines prefix of key in annotations | |||||
| AnnotationsKeyPrefix = "sedna.io/" | |||||
| ) | |||||
| type Model struct { | |||||
| Format string `json:"format,omitempty"` | |||||
| URL string `json:"url,omitempty"` | |||||
| Metrics map[string]interface{} `json:"metrics,omitempty"` | |||||
| } | |||||
| func (m *Model) GetURL() string { | |||||
| return m.URL | |||||
| } | |||||
| // CommonInterface describes the common interface of CRs | |||||
| type CommonInterface interface { | |||||
| metav1.Object | |||||
| schema.ObjectKind | |||||
| k8sruntime.Object | |||||
| } | |||||
| // UpstreamHandler is the function definition for handling the upstream updates, | |||||
| // i.e. resource updates (mainly status) from the LC (running at the edge) | |||||
| type UpstreamHandler = func(name, namespace, operation string, content []byte) error | |||||
| // UpstreamHandlerAddFunc defines the upstream controller register function for adding handlers | |||||
| type UpstreamHandlerAddFunc = func(kind string, updateHandler UpstreamHandler) error | |||||
| // DownstreamSendFunc is the send function for feature controllers to sync the resource updates (spec and status) to the LC | |||||
| type DownstreamSendFunc = func(nodeName string, eventType watch.EventType, obj interface{}) error | |||||
| // BaseControllerI defines the interface of a controller | |||||
| type BaseControllerI interface { | |||||
| Run(stopCh <-chan struct{}) | |||||
| } | |||||
| // FeatureControllerI defines the interface of an AI Feature controller | |||||
| type FeatureControllerI interface { | |||||
| BaseControllerI | |||||
| // SetDownstreamSendFunc sets up the downstream send function in the feature controller | |||||
| SetDownstreamSendFunc(f DownstreamSendFunc) error | |||||
| // SetUpstreamHandler sets up the upstream handler function for the feature controller | |||||
| SetUpstreamHandler(add UpstreamHandlerAddFunc) error | |||||
| } | |||||
| // ControllerContext defines the context that all feature controllers share and belong to | |||||
| type ControllerContext struct { | |||||
| Config *config.ControllerConfig | |||||
| KubeClient kubernetes.Interface | |||||
| KubeInformerFactory kubeinformers.SharedInformerFactory | |||||
| SednaClient sednaclientset.Interface | |||||
| SednaInformerFactory sednainformers.SharedInformerFactory | |||||
| } | |||||
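To make the contract concrete, a minimal feature controller that satisfies FeatureControllerI; everything below except the runtime types is invented for illustration:

    // Sketch: the smallest possible feature controller under this design.
    package noop

    import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

    // Name is the hypothetical registry key for this feature.
    const Name = "Noop"

    type Controller struct {
        sendToEdge runtime.DownstreamSendFunc
    }

    // New matches the FeatureFactory signature used by the registry.
    func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
        return &Controller{}, nil
    }

    // Run blocks until the manager closes stopCh.
    func (c *Controller) Run(stopCh <-chan struct{}) { <-stopCh }

    func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
        c.sendToEdge = f // used to push spec/status changes down to the LC
        return nil
    }

    func (c *Controller) SetUpstreamHandler(add runtime.UpstreamHandlerAddFunc) error {
        return nil // this feature reports nothing back from the edge
    }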
| @@ -1,4 +1,4 @@ | |||||
| package globalmanager | |||||
| package runtime | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| @@ -27,15 +27,15 @@ type WorkerMount struct { | |||||
| // WorkerParam describes the system-defined parameters of a worker | // WorkerParam describes the system-defined parameters of a worker | ||||
| type WorkerParam struct { | type WorkerParam struct { | ||||
| mounts []WorkerMount | |||||
| Mounts []WorkerMount | |||||
| env map[string]string | |||||
| workerType string | |||||
| Env map[string]string | |||||
| WorkerType string | |||||
| // if true, force to use hostNetwork | // if true, force to use hostNetwork | ||||
| hostNetwork bool | |||||
| HostNetwork bool | |||||
| restartPolicy v1.RestartPolicy | |||||
| RestartPolicy v1.RestartPolicy | |||||
| } | } | ||||
| // generateLabels generates labels for an object | // generateLabels generates labels for an object | ||||
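With these fields exported, feature packages outside runtime can populate worker parameters directly; a sketch with illustrative values, assuming job is the owning CR:

    // Sketch: a train-worker parameter block as a feature controller
    // might build it after this refactor.
    workerParam := &runtime.WorkerParam{
        WorkerType:    runtime.TrainPodType,
        HostNetwork:   false,
        RestartPolicy: v1.RestartPolicyOnFailure,
        Env: map[string]string{
            "NAMESPACE": job.Namespace,
            "JOB_NAME":  job.Name,
        },
    }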
| @@ -109,7 +109,7 @@ func CreateKubernetesService(kubeClient kubernetes.Interface, object CommonInter | |||||
| func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) { | func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) { | ||||
| InjectStorageInitializer(pod, workerParam) | InjectStorageInitializer(pod, workerParam) | ||||
| envs := createEnvVars(workerParam.env) | |||||
| envs := createEnvVars(workerParam.Env) | |||||
| for idx := range pod.Spec.Containers { | for idx := range pod.Spec.Containers { | ||||
| pod.Spec.Containers[idx].Env = append( | pod.Spec.Containers[idx].Env = append( | ||||
| pod.Spec.Containers[idx].Env, envs..., | pod.Spec.Containers[idx].Env, envs..., | ||||
| @@ -121,27 +121,27 @@ func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInter | |||||
| pod.Labels = make(map[string]string) | pod.Labels = make(map[string]string) | ||||
| } | } | ||||
| for k, v := range generateLabels(object, workerParam.workerType) { | |||||
| for k, v := range generateLabels(object, workerParam.WorkerType) { | |||||
| pod.Labels[k] = v | pod.Labels[k] = v | ||||
| } | } | ||||
| pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.workerType) + "-" | |||||
| pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.WorkerType) + "-" | |||||
| pod.Namespace = object.GetNamespace() | pod.Namespace = object.GetNamespace() | ||||
| if workerParam.hostNetwork { | |||||
| if workerParam.HostNetwork { | |||||
| // FIXME | // FIXME | ||||
| // force to set hostnetwork | // force to set hostnetwork | ||||
| pod.Spec.HostNetwork = true | pod.Spec.HostNetwork = true | ||||
| } | } | ||||
| if pod.Spec.RestartPolicy == "" { | if pod.Spec.RestartPolicy == "" { | ||||
| pod.Spec.RestartPolicy = workerParam.restartPolicy | |||||
| pod.Spec.RestartPolicy = workerParam.RestartPolicy | |||||
| } | } | ||||
| } | } | ||||
| // createPodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam | |||||
| func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) { | |||||
| // CreatePodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam | |||||
| func CreatePodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) { | |||||
| objectKind := object.GroupVersionKind() | objectKind := object.GroupVersionKind() | ||||
| pod, _ := k8scontroller.GetPodFromTemplate(spec, object, metav1.NewControllerRef(object, objectKind)) | pod, _ := k8scontroller.GetPodFromTemplate(spec, object, metav1.NewControllerRef(object, objectKind)) | ||||
| injectWorkerParam(pod, workerParam, object) | injectWorkerParam(pod, workerParam, object) | ||||
| @@ -149,7 +149,7 @@ func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, | |||||
| createdPod, err := client.CoreV1().Pods(object.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{}) | createdPod, err := client.CoreV1().Pods(object.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{}) | ||||
| objectName := object.GetNamespace() + "/" + object.GetName() | objectName := object.GetNamespace() + "/" + object.GetName() | ||||
| if err != nil { | if err != nil { | ||||
| klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.workerType, objectKind, objectName, err) | |||||
| klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.WorkerType, objectKind, objectName, err) | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| klog.V(2).Infof("pod %s is created successfully for %s %s", createdPod.Name, objectKind, objectName) | klog.V(2).Infof("pod %s is created successfully for %s %s", createdPod.Name, objectKind, objectName) | ||||
| @@ -1,168 +0,0 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package globalmanager | |||||
| import ( | |||||
| "encoding/json" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "k8s.io/apimachinery/pkg/runtime" | |||||
| "k8s.io/apimachinery/pkg/runtime/schema" | |||||
| ) | |||||
| // CommonInterface describes the common interface of CRs | |||||
| type CommonInterface interface { | |||||
| metav1.Object | |||||
| schema.ObjectKind | |||||
| runtime.Object | |||||
| } | |||||
| // FeatureControllerI defines the interface of an AI Feature controller | |||||
| type FeatureControllerI interface { | |||||
| Start() error | |||||
| GetName() string | |||||
| } | |||||
| type Model struct { | |||||
| Format string `json:"format,omitempty"` | |||||
| URL string `json:"url,omitempty"` | |||||
| Metrics map[string]interface{} `json:"metrics,omitempty"` | |||||
| } | |||||
| // the data of this condition, including the input/output of the next step | |||||
| type IncrementalCondData struct { | |||||
| Input *struct { | |||||
| // Only one model cases | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| DataURL string `json:"dataURL,omitempty"` | |||||
| // the data samples reference will be stored into this URL. | |||||
| // The content of this url would be: | |||||
| // # the first uncommented line means the directory | |||||
| // s3://dataset/ | |||||
| // mnist/0.jpg | |||||
| // mnist/1.jpg | |||||
| DataIndexURL string `json:"dataIndexURL,omitempty"` | |||||
| OutputDir string `json:"outputDir,omitempty"` | |||||
| } `json:"input,omitempty"` | |||||
| Output *struct { | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| } `json:"output,omitempty"` | |||||
| } | |||||
| const ( | |||||
| // TrainPodType is type of train pod | |||||
| TrainPodType = "train" | |||||
| // EvalPodType is type of eval pod | |||||
| EvalPodType = "eval" | |||||
| // InferencePodType is type of inference pod | |||||
| InferencePodType = "inference" | |||||
| // AnnotationsKeyPrefix defines prefix of key in annotations | |||||
| AnnotationsKeyPrefix = "sedna.io/" | |||||
| ) | |||||
| func (m *Model) GetURL() string { | |||||
| return m.URL | |||||
| } | |||||
| func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string { | |||||
| var modelURLs []string | |||||
| if model != nil { | |||||
| modelURLs = append(modelURLs, model.GetURL()) | |||||
| } else { | |||||
| for _, m := range models { | |||||
| modelURLs = append(modelURLs, m.GetURL()) | |||||
| } | |||||
| } | |||||
| return modelURLs | |||||
| } | |||||
| func (cd *IncrementalCondData) GetInputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) | |||||
| } | |||||
| func (cd *IncrementalCondData) GetOutputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) | |||||
| } | |||||
| func (cd *IncrementalCondData) Unmarshal(data []byte) error { | |||||
| return json.Unmarshal(data, cd) | |||||
| } | |||||
| func (cd IncrementalCondData) Marshal() ([]byte, error) { | |||||
| return json.Marshal(cd) | |||||
| } | |||||
| // the data of this condition, including the input/output of the next step | |||||
| type LifelongLearningCondData struct { | |||||
| Input *struct { | |||||
| // Only one model cases | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| DataURL string `json:"dataURL,omitempty"` | |||||
| // the data samples reference will be stored into this URL. | |||||
| // The content of this url would be: | |||||
| // # the first uncommented line means the directory | |||||
| // s3://dataset/ | |||||
| // mnist/0.jpg | |||||
| // mnist/1.jpg | |||||
| DataIndexURL string `json:"dataIndexURL,omitempty"` | |||||
| OutputDir string `json:"outputDir,omitempty"` | |||||
| } `json:"input,omitempty"` | |||||
| Output *struct { | |||||
| Model *Model `json:"model,omitempty"` | |||||
| Models []Model `json:"models,omitempty"` | |||||
| } `json:"output,omitempty"` | |||||
| } | |||||
| func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string { | |||||
| var modelURLs []string | |||||
| if model != nil { | |||||
| modelURLs = append(modelURLs, model.GetURL()) | |||||
| } else { | |||||
| for _, m := range models { | |||||
| modelURLs = append(modelURLs, m.GetURL()) | |||||
| } | |||||
| } | |||||
| return modelURLs | |||||
| } | |||||
| func (cd *LifelongLearningCondData) Unmarshal(data []byte) error { | |||||
| return json.Unmarshal(data, cd) | |||||
| } | |||||
| func (cd LifelongLearningCondData) Marshal() ([]byte, error) { | |||||
| return json.Marshal(cd) | |||||
| } | |||||
| func (cd *LifelongLearningCondData) GetInputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) | |||||
| } | |||||
| func (cd *LifelongLearningCondData) GetOutputModelURLs() []string { | |||||
| return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) | |||||
| } | |||||
| @@ -1,519 +0,0 @@ | |||||
| /* | |||||
| Copyright 2021 The KubeEdge Authors. | |||||
| Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| you may not use this file except in compliance with the License. | |||||
| You may obtain a copy of the License at | |||||
| http://www.apache.org/licenses/LICENSE-2.0 | |||||
| Unless required by applicable law or agreed to in writing, software | |||||
| distributed under the License is distributed on an "AS IS" BASIS, | |||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| See the License for the specific language governing permissions and | |||||
| limitations under the License. | |||||
| */ | |||||
| package globalmanager | |||||
| import ( | |||||
| "context" | |||||
| "encoding/json" | |||||
| "fmt" | |||||
| "strings" | |||||
| v1 "k8s.io/api/core/v1" | |||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |||||
| "k8s.io/klog/v2" | |||||
| sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" | |||||
| clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/config" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" | |||||
| "github.com/kubeedge/sedna/pkg/globalmanager/utils" | |||||
| ) | |||||
| // updateHandler handles the updates from LC(running at edge) to update the | |||||
| // corresponding resource | |||||
| type updateHandler func(namespace, name, operation string, content []byte) error | |||||
| // UpstreamController subscribes to updates from the edge and syncs them to the k8s api server | |||||
| type UpstreamController struct { | |||||
| client *clientset.SednaV1alpha1Client | |||||
| messageLayer messagelayer.MessageLayer | |||||
| updateHandlers map[string]updateHandler | |||||
| } | |||||
| const upstreamStatusUpdateRetries = 3 | |||||
| // retryUpdateStatus simply retries to call the status update func | |||||
| func retryUpdateStatus(name, namespace string, updateStatusFunc func() error) error { | |||||
| var err error | |||||
| for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ { | |||||
| err = updateStatusFunc() | |||||
| if err == nil { | |||||
| return nil | |||||
| } | |||||
| klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err) | |||||
| } | |||||
| return err | |||||
| } | |||||
| func newUnmarshalError(namespace, name, operation string, content []byte) error { | |||||
| return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content)) | |||||
| } | |||||
| func checkUpstreamOperation(operation string) error { | |||||
| // currently only the 'status' operation is supported | |||||
| if operation != "status" { | |||||
| return fmt.Errorf("unknown operation %s", operation) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| // updateDatasetStatus updates the dataset status | |||||
| func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status sednav1.DatasetStatus) error { | |||||
| client := uc.client.Datasets(namespace) | |||||
| if status.UpdateTime == nil { | |||||
| now := metav1.Now() | |||||
| status.UpdateTime = &now | |||||
| } | |||||
| return retryUpdateStatus(name, namespace, func() error { | |||||
| dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| dataset.Status = status | |||||
| _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | |||||
| // updateDatasetFromEdge syncs update from edge | |||||
| func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { | |||||
| err := checkUpstreamOperation(operation) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| status := sednav1.DatasetStatus{} | |||||
| err = json.Unmarshal(content, &status) | |||||
| if err != nil { | |||||
| return newUnmarshalError(namespace, name, operation, content) | |||||
| } | |||||
| return uc.updateDatasetStatus(name, namespace, status) | |||||
| } | |||||
| // convertToMetrics converts the metrics from LCs to resource metrics | |||||
| func convertToMetrics(m map[string]interface{}) []sednav1.Metric { | |||||
| var l []sednav1.Metric | |||||
| for k, v := range m { | |||||
| var displayValue string | |||||
| switch t := v.(type) { | |||||
| case string: | |||||
| displayValue = t | |||||
| default: | |||||
| // ignore the json marshal error | |||||
| b, _ := json.Marshal(v) | |||||
| displayValue = string(b) | |||||
| } | |||||
| l = append(l, sednav1.Metric{Key: k, Value: displayValue}) | |||||
| } | |||||
| return l | |||||
| } | |||||
| func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string, metrics []sednav1.Metric) error { | |||||
| client := uc.client.JointInferenceServices(namespace) | |||||
| return retryUpdateStatus(name, namespace, func() error { | |||||
| joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| joint.Status.Metrics = metrics | |||||
| _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | |||||
| // updateJointInferenceFromEdge syncs the edge updates to k8s | |||||
| func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error { | |||||
| err := checkUpstreamOperation(operation) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| // Output defines owner output information | |||||
| type Output struct { | |||||
| ServiceInfo map[string]interface{} `json:"ownerInfo"` | |||||
| } | |||||
| var status struct { | |||||
| // Phase should always be "inference" | |||||
| Phase string `json:"phase"` | |||||
| Status string `json:"status"` | |||||
| Output *Output `json:"output"` | |||||
| } | |||||
| err = json.Unmarshal(content, &status) | |||||
| if err != nil { | |||||
| return newUnmarshalError(namespace, name, operation, content) | |||||
| } | |||||
| // TODO: propagate status.Status to k8s | |||||
| output := status.Output | |||||
| if output == nil || output.ServiceInfo == nil { | |||||
| // no output info | |||||
| klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) | |||||
| return nil | |||||
| } | |||||
| info := output.ServiceInfo | |||||
| for _, ignoreTimeKey := range []string{ | |||||
| "startTime", | |||||
| "updateTime", | |||||
| } { | |||||
| delete(info, ignoreTimeKey) | |||||
| } | |||||
| metrics := convertToMetrics(info) | |||||
| err = uc.updateJointInferenceMetrics(name, namespace, metrics) | |||||
| if err != nil { | |||||
| return fmt.Errorf("failed to update metrics, err:%+w", err) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (uc *UpstreamController) updateModelMetrics(name, namespace string, metrics []sednav1.Metric) error { | |||||
| client := uc.client.Models(namespace) | |||||
| return retryUpdateStatus(name, namespace, (func() error { | |||||
| model, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| now := metav1.Now() | |||||
| model.Status.UpdateTime = &now | |||||
| model.Status.Metrics = metrics | |||||
| _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) | |||||
| return err | |||||
| })) | |||||
| } | |||||
| func (uc *UpstreamController) updateModelMetricsByFederatedName(name, namespace string, metrics []sednav1.Metric) error { | |||||
| client := uc.client.FederatedLearningJobs(namespace) | |||||
| var err error | |||||
| federatedLearningJob, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| // federated crd not found | |||||
| return err | |||||
| } | |||||
| modelName := federatedLearningJob.Spec.AggregationWorker.Model.Name | |||||
| return uc.updateModelMetrics(modelName, namespace, metrics) | |||||
| } | |||||
| func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { | |||||
| client := uc.client.FederatedLearningJobs(namespace) | |||||
| return retryUpdateStatus(name, namespace, (func() error { | |||||
| job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| job.Status.Conditions = append(job.Status.Conditions, cond) | |||||
| _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) | |||||
| return err | |||||
| })) | |||||
| } | |||||
| // updateFederatedLearningJobFromEdge updates the federated job's status | |||||
| func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) { | |||||
| err = checkUpstreamOperation(operation) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| // JobInfo defines the job information | |||||
| type JobInfo struct { | |||||
| // Current training round | |||||
| CurrentRound int `json:"currentRound"` | |||||
| UpdateTime string `json:"updateTime"` | |||||
| } | |||||
| // Output defines job output information | |||||
| type Output struct { | |||||
| Models []Model `json:"models"` | |||||
| JobInfo *JobInfo `json:"ownerInfo"` | |||||
| } | |||||
| var status struct { | |||||
| Phase string `json:"phase"` | |||||
| Status string `json:"status"` | |||||
| Output *Output `json:"output"` | |||||
| } | |||||
| err = json.Unmarshal(content, &status) | |||||
| if err != nil { | |||||
| err = newUnmarshalError(namespace, name, operation, content) | |||||
| return | |||||
| } | |||||
| output := status.Output | |||||
| if output != nil { | |||||
| // Update the model's metrics | |||||
| if len(output.Models) > 0 { | |||||
| // only one model | |||||
| model := output.Models[0] | |||||
| metrics := convertToMetrics(model.Metrics) | |||||
| if len(metrics) > 0 { | |||||
| uc.updateModelMetricsByFederatedName(name, namespace, metrics) | |||||
| } | |||||
| } | |||||
| jobInfo := output.JobInfo | |||||
| // update the job info if there is any | |||||
| if jobInfo != nil && jobInfo.CurrentRound > 0 { | |||||
| // Find a good place to save the progress info | |||||
| // TODO: more meaningful reason/message | |||||
| reason := "DoTraining" | |||||
| message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) | |||||
| cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) | |||||
| uc.appendFederatedLearningJobStatusCondition(name, namespace, cond) | |||||
| } | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { | |||||
| client := uc.client.IncrementalLearningJobs(namespace) | |||||
| return retryUpdateStatus(name, namespace, (func() error { | |||||
| job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| job.Status.Conditions = append(job.Status.Conditions, cond) | |||||
| _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) | |||||
| return err | |||||
| })) | |||||
| } | |||||
| // updateIncrementalLearningFromEdge syncs the edge updates to k8s | |||||
| func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error { | |||||
| err := checkUpstreamOperation(operation) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| var jobStatus struct { | |||||
| Phase string `json:"phase"` | |||||
| Status string `json:"status"` | |||||
| } | |||||
| err = json.Unmarshal(content, &jobStatus) | |||||
| if err != nil { | |||||
| return newUnmarshalError(namespace, name, operation, content) | |||||
| } | |||||
| // Get the condition data. | |||||
| // Here unmarshal and marshal immediately to skip the unnecessary fields | |||||
| var condData IncrementalCondData | |||||
| err = json.Unmarshal(content, &condData) | |||||
| if err != nil { | |||||
| return newUnmarshalError(namespace, name, operation, content) | |||||
| } | |||||
| condDataBytes, _ := json.Marshal(&condData) | |||||
| cond := sednav1.ILJobCondition{ | |||||
| Status: v1.ConditionTrue, | |||||
| LastHeartbeatTime: metav1.Now(), | |||||
| LastTransitionTime: metav1.Now(), | |||||
| Data: string(condDataBytes), | |||||
| Message: "reported by lc", | |||||
| } | |||||
| switch strings.ToLower(jobStatus.Phase) { | |||||
| case "train": | |||||
| cond.Stage = sednav1.ILJobTrain | |||||
| case "eval": | |||||
| cond.Stage = sednav1.ILJobEval | |||||
| case "deploy": | |||||
| cond.Stage = sednav1.ILJobDeploy | |||||
| default: | |||||
| return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) | |||||
| } | |||||
| switch strings.ToLower(jobStatus.Status) { | |||||
| case "ready": | |||||
| cond.Type = sednav1.ILJobStageCondReady | |||||
| case "completed": | |||||
| cond.Type = sednav1.ILJobStageCondCompleted | |||||
| case "failed": | |||||
| cond.Type = sednav1.ILJobStageCondFailed | |||||
| case "waiting": | |||||
| cond.Type = sednav1.ILJobStageCondWaiting | |||||
| default: | |||||
| return fmt.Errorf("invalid condition type: %v", jobStatus.Status) | |||||
| } | |||||
| err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond) | |||||
| if err != nil { | |||||
| return fmt.Errorf("failed to append condition, err:%+w", err) | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (uc *UpstreamController) appendLifelongLearningJobStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { | |||||
| client := uc.client.LifelongLearningJobs(namespace) | |||||
| return retryUpdateStatus(name, namespace, func() error { | |||||
| job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| job.Status.Conditions = append(job.Status.Conditions, cond) | |||||
| _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) | |||||
| return err | |||||
| }) | |||||
| } | |||||
// updateLifelongLearningJobFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateLifelongLearningJobFromEdge(name, namespace, operation string, content []byte) error {
	err := checkUpstreamOperation(operation)
	if err != nil {
		return err
	}

	var jobStatus struct {
		Phase  string `json:"phase"`
		Status string `json:"status"`
	}
	err = json.Unmarshal(content, &jobStatus)
	if err != nil {
		return newUnmarshalError(namespace, name, operation, content)
	}

	// Get the condition data.
	// Unmarshal and immediately re-marshal to strip the fields we don't need.
	var condData LifelongLearningCondData
	err = json.Unmarshal(content, &condData)
	if err != nil {
		return newUnmarshalError(namespace, name, operation, content)
	}
	condDataBytes, _ := json.Marshal(&condData)

	cond := sednav1.LLJobCondition{
		Status:             v1.ConditionTrue,
		LastHeartbeatTime:  metav1.Now(),
		LastTransitionTime: metav1.Now(),
		Data:               string(condDataBytes),
		Message:            "reported by lc",
	}

	switch strings.ToLower(jobStatus.Phase) {
	case "train":
		cond.Stage = sednav1.LLJobTrain
	case "eval":
		cond.Stage = sednav1.LLJobEval
	case "deploy":
		cond.Stage = sednav1.LLJobDeploy
	default:
		return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
	}

	switch strings.ToLower(jobStatus.Status) {
	case "ready":
		cond.Type = sednav1.LLJobStageCondReady
	case "completed":
		cond.Type = sednav1.LLJobStageCondCompleted
	case "failed":
		cond.Type = sednav1.LLJobStageCondFailed
	case "waiting":
		cond.Type = sednav1.LLJobStageCondWaiting
	default:
		return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
	}
	err = uc.appendLifelongLearningJobStatusCondition(name, namespace, cond)
	if err != nil {
		return fmt.Errorf("failed to append condition: %w", err)
	}
	return nil
}
// syncEdgeUpdate receives the updates from edge and syncs them to k8s.
func (uc *UpstreamController) syncEdgeUpdate() {
	for {
		select {
		case <-uc.messageLayer.Done():
			klog.Info("Stop sedna upstream loop")
			return
		default:
		}

		update, err := uc.messageLayer.ReceiveResourceUpdate()
		if err != nil {
			klog.Warningf("Ignoring update due to error: %+v", err)
			continue
		}

		kind := update.Kind
		namespace := update.Namespace
		name := update.Name
		operation := update.Operation

		handler, ok := uc.updateHandlers[kind]
		if ok {
			err := handler(name, namespace, operation, update.Content)
			if err != nil {
				klog.Errorf("Failed to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err)
			}
		} else {
			klog.Warningf("No handler for resource kind %s", kind)
		}
	}
}
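
// For example, an update arriving with Kind "incrementallearningjob" is
// dispatched to updateIncrementalLearningFromEdge through the updateHandlers
// map built in NewUpstreamController below; kinds without a registered
// handler are logged and dropped.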
// Start starts the upstream controller
func (uc *UpstreamController) Start() error {
	klog.Info("Start the sedna upstream controller")
	go uc.syncEdgeUpdate()
	return nil
}

// GetName returns the name of the upstream controller
func (uc *UpstreamController) GetName() string {
	return "UpstreamController"
}

// NewUpstreamController creates a new Upstream controller from config
func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
	client, err := utils.NewCRDClient()
	if err != nil {
		return nil, fmt.Errorf("failed to create CRD client: %w", err)
	}
	uc := &UpstreamController{
		client:       client,
		messageLayer: messagelayer.NewContextMessageLayer(),
	}
	// NOTE: currently there is no direct model update from the edge;
	// model updates are triggered by the corresponding training feature.
	uc.updateHandlers = map[string]updateHandler{
		"dataset":                uc.updateDatasetFromEdge,
		"jointinferenceservice":  uc.updateJointInferenceFromEdge,
		"federatedlearningjob":   uc.updateFederatedLearningJobFromEdge,
		"incrementallearningjob": uc.updateIncrementalLearningFromEdge,
		"lifelonglearningjob":    uc.updateLifelongLearningJobFromEdge,
	}

	return uc, nil
}
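
// A minimal usage sketch (assuming a populated *config.ControllerConfig):
//
//	uc, err := NewUpstreamController(cfg)
//	if err != nil {
//		klog.Fatalf("failed to create upstream controller: %v", err)
//	}
//	_ = uc.Start() // non-blocking: syncEdgeUpdate runs in its own goroutine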
@@ -31,7 +31,8 @@ import (
 	"github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	"github.com/kubeedge/sedna/pkg/globalmanager"
+	gmtypes "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 	"github.com/kubeedge/sedna/pkg/localcontroller/db"
 	"github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
 	"github.com/kubeedge/sedna/pkg/localcontroller/storage"
@@ -435,11 +436,11 @@ func newTrigger(t sednav1.Trigger) (trigger.Base, error) {
 func (im *IncrementalJobManager) getTrainOrEvalModel(job *IncrementalLearningJob, jobStage sednav1.ILJobStage) *ModelInfo {
 	jobConditions := job.Status.Conditions
-	// TODO: globalmanager.type changes to common.type for gm and lc
-	var models []globalmanager.Model
+	// TODO: runtime.type changes to common.type for gm and lc
+	var models []runtime.Model
 	for i := len(jobConditions) - 1; i >= 0; i-- {
-		var cond globalmanager.IncrementalCondData
+		var cond gmtypes.IncrementalCondData
 		jobCond := jobConditions[i]
 		if jobCond.Stage == sednav1.ILJobTrain && jobCond.Type == sednav1.ILJobStageCondCompleted {
 			if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil {